IMPORTING LIBRARIES & DATA¶

In [5]:
import warnings
warnings.filterwarnings('ignore')
In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [8]:
pd.set_option('display.max_columns', None)
In [9]:
app_data = pd.read_csv('application_data.csv.zip')
prev_data = pd.read_csv('previous_application.csv.zip')
In [10]:
print("Application Data Shape:", app_data.shape)
print("Previous Application Data Shape:", prev_data.shape)
Application Data Shape: (307511, 122)
Previous Application Data Shape: (1670214, 37)
In [11]:
print(app_data.info())
print(app_data.head())
print(app_data.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

   AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE  \
0         351000.0   Unaccompanied          Working   
1        1129500.0          Family    State servant   
2         135000.0   Unaccompanied          Working   
3         297000.0   Unaccompanied          Working   
4         513000.0   Unaccompanied          Working   

             NAME_EDUCATION_TYPE    NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  \
0  Secondary / secondary special  Single / not married  House / apartment   
1               Higher education               Married  House / apartment   
2  Secondary / secondary special  Single / not married  House / apartment   
3  Secondary / secondary special        Civil marriage  House / apartment   
4  Secondary / secondary special  Single / not married  House / apartment   

   REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  DAYS_REGISTRATION  \
0                    0.018801       -9461           -637            -3648.0   
1                    0.003541      -16765          -1188            -1186.0   
2                    0.010032      -19046           -225            -4260.0   
3                    0.008019      -19005          -3039            -9833.0   
4                    0.028663      -19932          -3038            -4311.0   

   DAYS_ID_PUBLISH  OWN_CAR_AGE  FLAG_MOBIL  FLAG_EMP_PHONE  FLAG_WORK_PHONE  \
0            -2120          NaN           1               1                0   
1             -291          NaN           1               1                0   
2            -2531         26.0           1               1                1   
3            -2437          NaN           1               1                0   
4            -3458          NaN           1               1                0   

   FLAG_CONT_MOBILE  FLAG_PHONE  FLAG_EMAIL OCCUPATION_TYPE  CNT_FAM_MEMBERS  \
0                 1           1           0        Laborers              1.0   
1                 1           1           0      Core staff              2.0   
2                 1           1           0        Laborers              1.0   
3                 1           0           0        Laborers              2.0   
4                 1           0           0      Core staff              1.0   

   REGION_RATING_CLIENT  REGION_RATING_CLIENT_W_CITY  \
0                     2                            2   
1                     1                            1   
2                     2                            2   
3                     2                            2   
4                     2                            2   

  WEEKDAY_APPR_PROCESS_START  HOUR_APPR_PROCESS_START  \
0                  WEDNESDAY                       10   
1                     MONDAY                       11   
2                     MONDAY                        9   
3                  WEDNESDAY                       17   
4                   THURSDAY                       11   

   REG_REGION_NOT_LIVE_REGION  REG_REGION_NOT_WORK_REGION  \
0                           0                           0   
1                           0                           0   
2                           0                           0   
3                           0                           0   
4                           0                           0   

   LIVE_REGION_NOT_WORK_REGION  REG_CITY_NOT_LIVE_CITY  \
0                            0                       0   
1                            0                       0   
2                            0                       0   
3                            0                       0   
4                            0                       0   

   REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY       ORGANIZATION_TYPE  \
0                       0                        0  Business Entity Type 3   
1                       0                        0                  School   
2                       0                        0              Government   
3                       0                        0  Business Entity Type 3   
4                       1                        1                Religion   

   EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  APARTMENTS_AVG  BASEMENTAREA_AVG  \
0      0.083037      0.262949      0.139376          0.0247            0.0369   
1      0.311267      0.622246           NaN          0.0959            0.0529   
2           NaN      0.555912      0.729567             NaN               NaN   
3           NaN      0.650442           NaN             NaN               NaN   
4           NaN      0.322738           NaN             NaN               NaN   

   YEARS_BEGINEXPLUATATION_AVG  YEARS_BUILD_AVG  COMMONAREA_AVG  \
0                       0.9722           0.6192          0.0143   
1                       0.9851           0.7960          0.0605   
2                          NaN              NaN             NaN   
3                          NaN              NaN             NaN   
4                          NaN              NaN             NaN   

   ELEVATORS_AVG  ENTRANCES_AVG  FLOORSMAX_AVG  FLOORSMIN_AVG  LANDAREA_AVG  \
0           0.00         0.0690         0.0833         0.1250        0.0369   
1           0.08         0.0345         0.2917         0.3333        0.0130   
2            NaN            NaN            NaN            NaN           NaN   
3            NaN            NaN            NaN            NaN           NaN   
4            NaN            NaN            NaN            NaN           NaN   

   LIVINGAPARTMENTS_AVG  LIVINGAREA_AVG  NONLIVINGAPARTMENTS_AVG  \
0                0.0202          0.0190                   0.0000   
1                0.0773          0.0549                   0.0039   
2                   NaN             NaN                      NaN   
3                   NaN             NaN                      NaN   
4                   NaN             NaN                      NaN   

   NONLIVINGAREA_AVG  APARTMENTS_MODE  BASEMENTAREA_MODE  \
0             0.0000           0.0252             0.0383   
1             0.0098           0.0924             0.0538   
2                NaN              NaN                NaN   
3                NaN              NaN                NaN   
4                NaN              NaN                NaN   

   YEARS_BEGINEXPLUATATION_MODE  YEARS_BUILD_MODE  COMMONAREA_MODE  \
0                        0.9722            0.6341           0.0144   
1                        0.9851            0.8040           0.0497   
2                           NaN               NaN              NaN   
3                           NaN               NaN              NaN   
4                           NaN               NaN              NaN   

   ELEVATORS_MODE  ENTRANCES_MODE  FLOORSMAX_MODE  FLOORSMIN_MODE  \
0          0.0000          0.0690          0.0833          0.1250   
1          0.0806          0.0345          0.2917          0.3333   
2             NaN             NaN             NaN             NaN   
3             NaN             NaN             NaN             NaN   
4             NaN             NaN             NaN             NaN   

   LANDAREA_MODE  LIVINGAPARTMENTS_MODE  LIVINGAREA_MODE  \
0         0.0377                  0.022           0.0198   
1         0.0128                  0.079           0.0554   
2            NaN                    NaN              NaN   
3            NaN                    NaN              NaN   
4            NaN                    NaN              NaN   

   NONLIVINGAPARTMENTS_MODE  NONLIVINGAREA_MODE  APARTMENTS_MEDI  \
0                       0.0                 0.0           0.0250   
1                       0.0                 0.0           0.0968   
2                       NaN                 NaN              NaN   
3                       NaN                 NaN              NaN   
4                       NaN                 NaN              NaN   

   BASEMENTAREA_MEDI  YEARS_BEGINEXPLUATATION_MEDI  YEARS_BUILD_MEDI  \
0             0.0369                        0.9722            0.6243   
1             0.0529                        0.9851            0.7987   
2                NaN                           NaN               NaN   
3                NaN                           NaN               NaN   
4                NaN                           NaN               NaN   

   COMMONAREA_MEDI  ELEVATORS_MEDI  ENTRANCES_MEDI  FLOORSMAX_MEDI  \
0           0.0144            0.00          0.0690          0.0833   
1           0.0608            0.08          0.0345          0.2917   
2              NaN             NaN             NaN             NaN   
3              NaN             NaN             NaN             NaN   
4              NaN             NaN             NaN             NaN   

   FLOORSMIN_MEDI  LANDAREA_MEDI  LIVINGAPARTMENTS_MEDI  LIVINGAREA_MEDI  \
0          0.1250         0.0375                 0.0205           0.0193   
1          0.3333         0.0132                 0.0787           0.0558   
2             NaN            NaN                    NaN              NaN   
3             NaN            NaN                    NaN              NaN   
4             NaN            NaN                    NaN              NaN   

   NONLIVINGAPARTMENTS_MEDI  NONLIVINGAREA_MEDI FONDKAPREMONT_MODE  \
0                    0.0000                0.00   reg oper account   
1                    0.0039                0.01   reg oper account   
2                       NaN                 NaN                NaN   
3                       NaN                 NaN                NaN   
4                       NaN                 NaN                NaN   

   HOUSETYPE_MODE  TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE  \
0  block of flats          0.0149       Stone, brick                  No   
1  block of flats          0.0714              Block                  No   
2             NaN             NaN                NaN                 NaN   
3             NaN             NaN                NaN                 NaN   
4             NaN             NaN                NaN                 NaN   

   OBS_30_CNT_SOCIAL_CIRCLE  DEF_30_CNT_SOCIAL_CIRCLE  \
0                       2.0                       2.0   
1                       1.0                       0.0   
2                       0.0                       0.0   
3                       2.0                       0.0   
4                       0.0                       0.0   

   OBS_60_CNT_SOCIAL_CIRCLE  DEF_60_CNT_SOCIAL_CIRCLE  DAYS_LAST_PHONE_CHANGE  \
0                       2.0                       2.0                 -1134.0   
1                       1.0                       0.0                  -828.0   
2                       0.0                       0.0                  -815.0   
3                       2.0                       0.0                  -617.0   
4                       0.0                       0.0                 -1106.0   

   FLAG_DOCUMENT_2  FLAG_DOCUMENT_3  FLAG_DOCUMENT_4  FLAG_DOCUMENT_5  \
0                0                1                0                0   
1                0                1                0                0   
2                0                0                0                0   
3                0                1                0                0   
4                0                0                0                0   

   FLAG_DOCUMENT_6  FLAG_DOCUMENT_7  FLAG_DOCUMENT_8  FLAG_DOCUMENT_9  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3                0                0                0                0   
4                0                0                1                0   

   FLAG_DOCUMENT_10  FLAG_DOCUMENT_11  FLAG_DOCUMENT_12  FLAG_DOCUMENT_13  \
0                 0                 0                 0                 0   
1                 0                 0                 0                 0   
2                 0                 0                 0                 0   
3                 0                 0                 0                 0   
4                 0                 0                 0                 0   

   FLAG_DOCUMENT_14  FLAG_DOCUMENT_15  FLAG_DOCUMENT_16  FLAG_DOCUMENT_17  \
0                 0                 0                 0                 0   
1                 0                 0                 0                 0   
2                 0                 0                 0                 0   
3                 0                 0                 0                 0   
4                 0                 0                 0                 0   

   FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20  FLAG_DOCUMENT_21  \
0                 0                 0                 0                 0   
1                 0                 0                 0                 0   
2                 0                 0                 0                 0   
3                 0                 0                 0                 0   
4                 0                 0                 0                 0   

   AMT_REQ_CREDIT_BUREAU_HOUR  AMT_REQ_CREDIT_BUREAU_DAY  \
0                         0.0                        0.0   
1                         0.0                        0.0   
2                         0.0                        0.0   
3                         NaN                        NaN   
4                         0.0                        0.0   

   AMT_REQ_CREDIT_BUREAU_WEEK  AMT_REQ_CREDIT_BUREAU_MON  \
0                         0.0                        0.0   
1                         0.0                        0.0   
2                         0.0                        0.0   
3                         NaN                        NaN   
4                         0.0                        0.0   

   AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_YEAR  
0                        0.0                         1.0  
1                        0.0                         0.0  
2                        0.0                         0.0  
3                        NaN                         NaN  
4                        0.0                         0.0  
          SK_ID_CURR         TARGET   CNT_CHILDREN  AMT_INCOME_TOTAL  \
count  307511.000000  307511.000000  307511.000000      3.075110e+05   
mean   278180.518577       0.080729       0.417052      1.687979e+05   
std    102790.175348       0.272419       0.722121      2.371231e+05   
min    100002.000000       0.000000       0.000000      2.565000e+04   
25%    189145.500000       0.000000       0.000000      1.125000e+05   
50%    278202.000000       0.000000       0.000000      1.471500e+05   
75%    367142.500000       0.000000       1.000000      2.025000e+05   
max    456255.000000       1.000000      19.000000      1.170000e+08   

         AMT_CREDIT    AMT_ANNUITY  AMT_GOODS_PRICE  \
count  3.075110e+05  307499.000000     3.072330e+05   
mean   5.990260e+05   27108.573909     5.383962e+05   
std    4.024908e+05   14493.737315     3.694465e+05   
min    4.500000e+04    1615.500000     4.050000e+04   
25%    2.700000e+05   16524.000000     2.385000e+05   
50%    5.135310e+05   24903.000000     4.500000e+05   
75%    8.086500e+05   34596.000000     6.795000e+05   
max    4.050000e+06  258025.500000     4.050000e+06   

       REGION_POPULATION_RELATIVE     DAYS_BIRTH  DAYS_EMPLOYED  \
count               307511.000000  307511.000000  307511.000000   
mean                     0.020868  -16036.995067   63815.045904   
std                      0.013831    4363.988632  141275.766519   
min                      0.000290  -25229.000000  -17912.000000   
25%                      0.010006  -19682.000000   -2760.000000   
50%                      0.018850  -15750.000000   -1213.000000   
75%                      0.028663  -12413.000000    -289.000000   
max                      0.072508   -7489.000000  365243.000000   

       DAYS_REGISTRATION  DAYS_ID_PUBLISH    OWN_CAR_AGE     FLAG_MOBIL  \
count      307511.000000    307511.000000  104582.000000  307511.000000   
mean        -4986.120328     -2994.202373      12.061091       0.999997   
std          3522.886321      1509.450419      11.944812       0.001803   
min        -24672.000000     -7197.000000       0.000000       0.000000   
25%         -7479.500000     -4299.000000       5.000000       1.000000   
50%         -4504.000000     -3254.000000       9.000000       1.000000   
75%         -2010.000000     -1720.000000      15.000000       1.000000   
max             0.000000         0.000000      91.000000       1.000000   

       FLAG_EMP_PHONE  FLAG_WORK_PHONE  FLAG_CONT_MOBILE     FLAG_PHONE  \
count   307511.000000    307511.000000     307511.000000  307511.000000   
mean         0.819889         0.199368          0.998133       0.281066   
std          0.384280         0.399526          0.043164       0.449521   
min          0.000000         0.000000          0.000000       0.000000   
25%          1.000000         0.000000          1.000000       0.000000   
50%          1.000000         0.000000          1.000000       0.000000   
75%          1.000000         0.000000          1.000000       1.000000   
max          1.000000         1.000000          1.000000       1.000000   

          FLAG_EMAIL  CNT_FAM_MEMBERS  REGION_RATING_CLIENT  \
count  307511.000000    307509.000000         307511.000000   
mean        0.056720         2.152665              2.052463   
std         0.231307         0.910682              0.509034   
min         0.000000         1.000000              1.000000   
25%         0.000000         2.000000              2.000000   
50%         0.000000         2.000000              2.000000   
75%         0.000000         3.000000              2.000000   
max         1.000000        20.000000              3.000000   

       REGION_RATING_CLIENT_W_CITY  HOUR_APPR_PROCESS_START  \
count                307511.000000            307511.000000   
mean                      2.031521                12.063419   
std                       0.502737                 3.265832   
min                       1.000000                 0.000000   
25%                       2.000000                10.000000   
50%                       2.000000                12.000000   
75%                       2.000000                14.000000   
max                       3.000000                23.000000   

       REG_REGION_NOT_LIVE_REGION  REG_REGION_NOT_WORK_REGION  \
count               307511.000000               307511.000000   
mean                     0.015144                    0.050769   
std                      0.122126                    0.219526   
min                      0.000000                    0.000000   
25%                      0.000000                    0.000000   
50%                      0.000000                    0.000000   
75%                      0.000000                    0.000000   
max                      1.000000                    1.000000   

       LIVE_REGION_NOT_WORK_REGION  REG_CITY_NOT_LIVE_CITY  \
count                307511.000000           307511.000000   
mean                      0.040659                0.078173   
std                       0.197499                0.268444   
min                       0.000000                0.000000   
25%                       0.000000                0.000000   
50%                       0.000000                0.000000   
75%                       0.000000                0.000000   
max                       1.000000                1.000000   

       REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY   EXT_SOURCE_1  \
count           307511.000000            307511.000000  134133.000000   
mean                 0.230454                 0.179555       0.502130   
std                  0.421124                 0.383817       0.211062   
min                  0.000000                 0.000000       0.014568   
25%                  0.000000                 0.000000       0.334007   
50%                  0.000000                 0.000000       0.505998   
75%                  0.000000                 0.000000       0.675053   
max                  1.000000                 1.000000       0.962693   

       EXT_SOURCE_2   EXT_SOURCE_3  APARTMENTS_AVG  BASEMENTAREA_AVG  \
count  3.068510e+05  246546.000000    151450.00000     127568.000000   
mean   5.143927e-01       0.510853         0.11744          0.088442   
std    1.910602e-01       0.194844         0.10824          0.082438   
min    8.170000e-08       0.000527         0.00000          0.000000   
25%    3.924574e-01       0.370650         0.05770          0.044200   
50%    5.659614e-01       0.535276         0.08760          0.076300   
75%    6.636171e-01       0.669057         0.14850          0.112200   
max    8.549997e-01       0.896010         1.00000          1.000000   

       YEARS_BEGINEXPLUATATION_AVG  YEARS_BUILD_AVG  COMMONAREA_AVG  \
count                157504.000000    103023.000000    92646.000000   
mean                      0.977735         0.752471        0.044621   
std                       0.059223         0.113280        0.076036   
min                       0.000000         0.000000        0.000000   
25%                       0.976700         0.687200        0.007800   
50%                       0.981600         0.755200        0.021100   
75%                       0.986600         0.823200        0.051500   
max                       1.000000         1.000000        1.000000   

       ELEVATORS_AVG  ENTRANCES_AVG  FLOORSMAX_AVG  FLOORSMIN_AVG  \
count  143620.000000  152683.000000  154491.000000   98869.000000   
mean        0.078942       0.149725       0.226282       0.231894   
std         0.134576       0.100049       0.144641       0.161380   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.069000       0.166700       0.083300   
50%         0.000000       0.137900       0.166700       0.208300   
75%         0.120000       0.206900       0.333300       0.375000   
max         1.000000       1.000000       1.000000       1.000000   

        LANDAREA_AVG  LIVINGAPARTMENTS_AVG  LIVINGAREA_AVG  \
count  124921.000000          97312.000000   153161.000000   
mean        0.066333              0.100775        0.107399   
std         0.081184              0.092576        0.110565   
min         0.000000              0.000000        0.000000   
25%         0.018700              0.050400        0.045300   
50%         0.048100              0.075600        0.074500   
75%         0.085600              0.121000        0.129900   
max         1.000000              1.000000        1.000000   

       NONLIVINGAPARTMENTS_AVG  NONLIVINGAREA_AVG  APARTMENTS_MODE  \
count             93997.000000      137829.000000    151450.000000   
mean                  0.008809           0.028358         0.114231   
std                   0.047732           0.069523         0.107936   
min                   0.000000           0.000000         0.000000   
25%                   0.000000           0.000000         0.052500   
50%                   0.000000           0.003600         0.084000   
75%                   0.003900           0.027700         0.143900   
max                   1.000000           1.000000         1.000000   

       BASEMENTAREA_MODE  YEARS_BEGINEXPLUATATION_MODE  YEARS_BUILD_MODE  \
count      127568.000000                 157504.000000     103023.000000   
mean            0.087543                      0.977065          0.759637   
std             0.084307                      0.064575          0.110111   
min             0.000000                      0.000000          0.000000   
25%             0.040700                      0.976700          0.699400   
50%             0.074600                      0.981600          0.764800   
75%             0.112400                      0.986600          0.823600   
max             1.000000                      1.000000          1.000000   

       COMMONAREA_MODE  ELEVATORS_MODE  ENTRANCES_MODE  FLOORSMAX_MODE  \
count     92646.000000   143620.000000   152683.000000   154491.000000   
mean          0.042553        0.074490        0.145193        0.222315   
std           0.074445        0.132256        0.100977        0.143709   
min           0.000000        0.000000        0.000000        0.000000   
25%           0.007200        0.000000        0.069000        0.166700   
50%           0.019000        0.000000        0.137900        0.166700   
75%           0.049000        0.120800        0.206900        0.333300   
max           1.000000        1.000000        1.000000        1.000000   

       FLOORSMIN_MODE  LANDAREA_MODE  LIVINGAPARTMENTS_MODE  LIVINGAREA_MODE  \
count    98869.000000  124921.000000           97312.000000    153161.000000   
mean         0.228058       0.064958               0.105645         0.105975   
std          0.161160       0.081750               0.097880         0.111845   
min          0.000000       0.000000               0.000000         0.000000   
25%          0.083300       0.016600               0.054200         0.042700   
50%          0.208300       0.045800               0.077100         0.073100   
75%          0.375000       0.084100               0.131300         0.125200   
max          1.000000       1.000000               1.000000         1.000000   

       NONLIVINGAPARTMENTS_MODE  NONLIVINGAREA_MODE  APARTMENTS_MEDI  \
count              93997.000000       137829.000000    151450.000000   
mean                   0.008076            0.027022         0.117850   
std                    0.046276            0.070254         0.109076   
min                    0.000000            0.000000         0.000000   
25%                    0.000000            0.000000         0.058300   
50%                    0.000000            0.001100         0.086400   
75%                    0.003900            0.023100         0.148900   
max                    1.000000            1.000000         1.000000   

       BASEMENTAREA_MEDI  YEARS_BEGINEXPLUATATION_MEDI  YEARS_BUILD_MEDI  \
count      127568.000000                 157504.000000     103023.000000   
mean            0.087955                      0.977752          0.755746   
std             0.082179                      0.059897          0.112066   
min             0.000000                      0.000000          0.000000   
25%             0.043700                      0.976700          0.691400   
50%             0.075800                      0.981600          0.758500   
75%             0.111600                      0.986600          0.825600   
max             1.000000                      1.000000          1.000000   

       COMMONAREA_MEDI  ELEVATORS_MEDI  ENTRANCES_MEDI  FLOORSMAX_MEDI  \
count     92646.000000   143620.000000   152683.000000   154491.000000   
mean          0.044595        0.078078        0.149213        0.225897   
std           0.076144        0.134467        0.100368        0.145067   
min           0.000000        0.000000        0.000000        0.000000   
25%           0.007900        0.000000        0.069000        0.166700   
50%           0.020800        0.000000        0.137900        0.166700   
75%           0.051300        0.120000        0.206900        0.333300   
max           1.000000        1.000000        1.000000        1.000000   

       FLOORSMIN_MEDI  LANDAREA_MEDI  LIVINGAPARTMENTS_MEDI  LIVINGAREA_MEDI  \
count    98869.000000  124921.000000           97312.000000    153161.000000   
mean         0.231625       0.067169               0.101954         0.108607   
std          0.161934       0.082167               0.093642         0.112260   
min          0.000000       0.000000               0.000000         0.000000   
25%          0.083300       0.018700               0.051300         0.045700   
50%          0.208300       0.048700               0.076100         0.074900   
75%          0.375000       0.086800               0.123100         0.130300   
max          1.000000       1.000000               1.000000         1.000000   

       NONLIVINGAPARTMENTS_MEDI  NONLIVINGAREA_MEDI  TOTALAREA_MODE  \
count              93997.000000       137829.000000   159080.000000   
mean                   0.008651            0.028236        0.102547   
std                    0.047415            0.070166        0.107462   
min                    0.000000            0.000000        0.000000   
25%                    0.000000            0.000000        0.041200   
50%                    0.000000            0.003100        0.068800   
75%                    0.003900            0.026600        0.127600   
max                    1.000000            1.000000        1.000000   

       OBS_30_CNT_SOCIAL_CIRCLE  DEF_30_CNT_SOCIAL_CIRCLE  \
count             306490.000000             306490.000000   
mean                   1.422245                  0.143421   
std                    2.400989                  0.446698   
min                    0.000000                  0.000000   
25%                    0.000000                  0.000000   
50%                    0.000000                  0.000000   
75%                    2.000000                  0.000000   
max                  348.000000                 34.000000   

       OBS_60_CNT_SOCIAL_CIRCLE  DEF_60_CNT_SOCIAL_CIRCLE  \
count             306490.000000             306490.000000   
mean                   1.405292                  0.100049   
std                    2.379803                  0.362291   
min                    0.000000                  0.000000   
25%                    0.000000                  0.000000   
50%                    0.000000                  0.000000   
75%                    2.000000                  0.000000   
max                  344.000000                 24.000000   

       DAYS_LAST_PHONE_CHANGE  FLAG_DOCUMENT_2  FLAG_DOCUMENT_3  \
count           307510.000000    307511.000000    307511.000000   
mean              -962.858788         0.000042         0.710023   
std                826.808487         0.006502         0.453752   
min              -4292.000000         0.000000         0.000000   
25%              -1570.000000         0.000000         0.000000   
50%               -757.000000         0.000000         1.000000   
75%               -274.000000         0.000000         1.000000   
max                  0.000000         1.000000         1.000000   

       FLAG_DOCUMENT_4  FLAG_DOCUMENT_5  FLAG_DOCUMENT_6  FLAG_DOCUMENT_7  \
count    307511.000000    307511.000000    307511.000000    307511.000000   
mean          0.000081         0.015115         0.088055         0.000192   
std           0.009016         0.122010         0.283376         0.013850   
min           0.000000         0.000000         0.000000         0.000000   
25%           0.000000         0.000000         0.000000         0.000000   
50%           0.000000         0.000000         0.000000         0.000000   
75%           0.000000         0.000000         0.000000         0.000000   
max           1.000000         1.000000         1.000000         1.000000   

       FLAG_DOCUMENT_8  FLAG_DOCUMENT_9  FLAG_DOCUMENT_10  FLAG_DOCUMENT_11  \
count    307511.000000    307511.000000     307511.000000     307511.000000   
mean          0.081376         0.003896          0.000023          0.003912   
std           0.273412         0.062295          0.004771          0.062424   
min           0.000000         0.000000          0.000000          0.000000   
25%           0.000000         0.000000          0.000000          0.000000   
50%           0.000000         0.000000          0.000000          0.000000   
75%           0.000000         0.000000          0.000000          0.000000   
max           1.000000         1.000000          1.000000          1.000000   

       FLAG_DOCUMENT_12  FLAG_DOCUMENT_13  FLAG_DOCUMENT_14  FLAG_DOCUMENT_15  \
count     307511.000000     307511.000000     307511.000000      307511.00000   
mean           0.000007          0.003525          0.002936           0.00121   
std            0.002550          0.059268          0.054110           0.03476   
min            0.000000          0.000000          0.000000           0.00000   
25%            0.000000          0.000000          0.000000           0.00000   
50%            0.000000          0.000000          0.000000           0.00000   
75%            0.000000          0.000000          0.000000           0.00000   
max            1.000000          1.000000          1.000000           1.00000   

       FLAG_DOCUMENT_16  FLAG_DOCUMENT_17  FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  \
count     307511.000000     307511.000000     307511.000000     307511.000000   
mean           0.009928          0.000267          0.008130          0.000595   
std            0.099144          0.016327          0.089798          0.024387   
min            0.000000          0.000000          0.000000          0.000000   
25%            0.000000          0.000000          0.000000          0.000000   
50%            0.000000          0.000000          0.000000          0.000000   
75%            0.000000          0.000000          0.000000          0.000000   
max            1.000000          1.000000          1.000000          1.000000   

       FLAG_DOCUMENT_20  FLAG_DOCUMENT_21  AMT_REQ_CREDIT_BUREAU_HOUR  \
count     307511.000000     307511.000000               265992.000000   
mean           0.000507          0.000335                    0.006402   
std            0.022518          0.018299                    0.083849   
min            0.000000          0.000000                    0.000000   
25%            0.000000          0.000000                    0.000000   
50%            0.000000          0.000000                    0.000000   
75%            0.000000          0.000000                    0.000000   
max            1.000000          1.000000                    4.000000   

       AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_WEEK  \
count              265992.000000               265992.000000   
mean                    0.007000                    0.034362   
std                     0.110757                    0.204685   
min                     0.000000                    0.000000   
25%                     0.000000                    0.000000   
50%                     0.000000                    0.000000   
75%                     0.000000                    0.000000   
max                     9.000000                    8.000000   

       AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  \
count              265992.000000              265992.000000   
mean                    0.267395                   0.265474   
std                     0.916002                   0.794056   
min                     0.000000                   0.000000   
25%                     0.000000                   0.000000   
50%                     0.000000                   0.000000   
75%                     0.000000                   0.000000   
max                    27.000000                 261.000000   

       AMT_REQ_CREDIT_BUREAU_YEAR  
count               265992.000000  
mean                     1.899974  
std                      1.869295  
min                      0.000000  
25%                      0.000000  
50%                      1.000000  
75%                      3.000000  
max                     25.000000  
In [12]:
print("Missing Values Count:")
print(app_data.isnull().sum().sort_values(ascending=False))
Missing Values Count:
COMMONAREA_MEDI             214865
COMMONAREA_AVG              214865
COMMONAREA_MODE             214865
NONLIVINGAPARTMENTS_MODE    213514
NONLIVINGAPARTMENTS_AVG     213514
                             ...  
NAME_HOUSING_TYPE                0
NAME_FAMILY_STATUS               0
NAME_EDUCATION_TYPE              0
NAME_INCOME_TYPE                 0
SK_ID_CURR                       0
Length: 122, dtype: int64
In [14]:
missing_percent = (app_data.isnull().sum() / len(app_data)) * 100
missing_df = pd.DataFrame({
    'Column': app_data.columns,
    'Missing_Count': app_data.isnull().sum(),
    'Missing_Percentage': missing_percent
}).sort_values('Missing_Percentage', ascending=False)
print(missing_df[missing_df['Missing_Percentage'] > 0])
                                            Column  Missing_Count  \
COMMONAREA_MEDI                    COMMONAREA_MEDI         214865   
COMMONAREA_AVG                      COMMONAREA_AVG         214865   
COMMONAREA_MODE                    COMMONAREA_MODE         214865   
NONLIVINGAPARTMENTS_MODE  NONLIVINGAPARTMENTS_MODE         213514   
NONLIVINGAPARTMENTS_AVG    NONLIVINGAPARTMENTS_AVG         213514   
...                                            ...            ...   
EXT_SOURCE_2                          EXT_SOURCE_2            660   
AMT_GOODS_PRICE                    AMT_GOODS_PRICE            278   
AMT_ANNUITY                            AMT_ANNUITY             12   
CNT_FAM_MEMBERS                    CNT_FAM_MEMBERS              2   
DAYS_LAST_PHONE_CHANGE      DAYS_LAST_PHONE_CHANGE              1   

                          Missing_Percentage  
COMMONAREA_MEDI                    69.872297  
COMMONAREA_AVG                     69.872297  
COMMONAREA_MODE                    69.872297  
NONLIVINGAPARTMENTS_MODE           69.432963  
NONLIVINGAPARTMENTS_AVG            69.432963  
...                                      ...  
EXT_SOURCE_2                        0.214626  
AMT_GOODS_PRICE                     0.090403  
AMT_ANNUITY                         0.003902  
CNT_FAM_MEMBERS                     0.000650  
DAYS_LAST_PHONE_CHANGE              0.000325  

[67 rows x 3 columns]
In [15]:
print(prev_data.info())
print(prev_data.head())
print(prev_data.tail())
print(prev_data.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   SK_ID_PREV                   1670214 non-null  int64  
 1   SK_ID_CURR                   1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE           1670214 non-null  object 
 3   AMT_ANNUITY                  1297979 non-null  float64
 4   AMT_APPLICATION              1670214 non-null  float64
 5   AMT_CREDIT                   1670213 non-null  float64
 6   AMT_DOWN_PAYMENT             774370 non-null   float64
 7   AMT_GOODS_PRICE              1284699 non-null  float64
 8   WEEKDAY_APPR_PROCESS_START   1670214 non-null  object 
 9   HOUR_APPR_PROCESS_START      1670214 non-null  int64  
 10  FLAG_LAST_APPL_PER_CONTRACT  1670214 non-null  object 
 11  NFLAG_LAST_APPL_IN_DAY       1670214 non-null  int64  
 12  RATE_DOWN_PAYMENT            774370 non-null   float64
 13  RATE_INTEREST_PRIMARY        5951 non-null     float64
 14  RATE_INTEREST_PRIVILEGED     5951 non-null     float64
 15  NAME_CASH_LOAN_PURPOSE       1670214 non-null  object 
 16  NAME_CONTRACT_STATUS         1670214 non-null  object 
 17  DAYS_DECISION                1670214 non-null  int64  
 18  NAME_PAYMENT_TYPE            1670214 non-null  object 
 19  CODE_REJECT_REASON           1670214 non-null  object 
 20  NAME_TYPE_SUITE              849809 non-null   object 
 21  NAME_CLIENT_TYPE             1670214 non-null  object 
 22  NAME_GOODS_CATEGORY          1670214 non-null  object 
 23  NAME_PORTFOLIO               1670214 non-null  object 
 24  NAME_PRODUCT_TYPE            1670214 non-null  object 
 25  CHANNEL_TYPE                 1670214 non-null  object 
 26  SELLERPLACE_AREA             1670214 non-null  int64  
 27  NAME_SELLER_INDUSTRY         1670214 non-null  object 
 28  CNT_PAYMENT                  1297984 non-null  float64
 29  NAME_YIELD_GROUP             1670214 non-null  object 
 30  PRODUCT_COMBINATION          1669868 non-null  object 
 31  DAYS_FIRST_DRAWING           997149 non-null   float64
 32  DAYS_FIRST_DUE               997149 non-null   float64
 33  DAYS_LAST_DUE_1ST_VERSION    997149 non-null   float64
 34  DAYS_LAST_DUE                997149 non-null   float64
 35  DAYS_TERMINATION             997149 non-null   float64
 36  NFLAG_INSURED_ON_APPROVAL    997149 non-null   float64
dtypes: float64(15), int64(6), object(16)
memory usage: 471.5+ MB
None
   SK_ID_PREV  SK_ID_CURR NAME_CONTRACT_TYPE  AMT_ANNUITY  AMT_APPLICATION  \
0     2030495      271877     Consumer loans     1730.430          17145.0   
1     2802425      108129         Cash loans    25188.615         607500.0   
2     2523466      122040         Cash loans    15060.735         112500.0   
3     2819243      176158         Cash loans    47041.335         450000.0   
4     1784265      202054         Cash loans    31924.395         337500.0   

   AMT_CREDIT  AMT_DOWN_PAYMENT  AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START  \
0     17145.0               0.0          17145.0                   SATURDAY   
1    679671.0               NaN         607500.0                   THURSDAY   
2    136444.5               NaN         112500.0                    TUESDAY   
3    470790.0               NaN         450000.0                     MONDAY   
4    404055.0               NaN         337500.0                   THURSDAY   

   HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT  \
0                       15                           Y   
1                       11                           Y   
2                       11                           Y   
3                        7                           Y   
4                        9                           Y   

   NFLAG_LAST_APPL_IN_DAY  RATE_DOWN_PAYMENT  RATE_INTEREST_PRIMARY  \
0                       1                0.0               0.182832   
1                       1                NaN                    NaN   
2                       1                NaN                    NaN   
3                       1                NaN                    NaN   
4                       1                NaN                    NaN   

   RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS  \
0                  0.867336                    XAP             Approved   
1                       NaN                    XNA             Approved   
2                       NaN                    XNA             Approved   
3                       NaN                    XNA             Approved   
4                       NaN                Repairs              Refused   

   DAYS_DECISION      NAME_PAYMENT_TYPE CODE_REJECT_REASON  NAME_TYPE_SUITE  \
0            -73  Cash through the bank                XAP              NaN   
1           -164                    XNA                XAP    Unaccompanied   
2           -301  Cash through the bank                XAP  Spouse, partner   
3           -512  Cash through the bank                XAP              NaN   
4           -781  Cash through the bank                 HC              NaN   

  NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE  \
0         Repeater              Mobile            POS               XNA   
1         Repeater                 XNA           Cash            x-sell   
2         Repeater                 XNA           Cash            x-sell   
3         Repeater                 XNA           Cash            x-sell   
4         Repeater                 XNA           Cash           walk-in   

              CHANNEL_TYPE  SELLERPLACE_AREA NAME_SELLER_INDUSTRY  \
0             Country-wide                35         Connectivity   
1           Contact center                -1                  XNA   
2  Credit and cash offices                -1                  XNA   
3  Credit and cash offices                -1                  XNA   
4  Credit and cash offices                -1                  XNA   

   CNT_PAYMENT NAME_YIELD_GROUP       PRODUCT_COMBINATION  DAYS_FIRST_DRAWING  \
0         12.0           middle  POS mobile with interest            365243.0   
1         36.0       low_action          Cash X-Sell: low            365243.0   
2         12.0             high         Cash X-Sell: high            365243.0   
3         12.0           middle       Cash X-Sell: middle            365243.0   
4         24.0             high         Cash Street: high                 NaN   

   DAYS_FIRST_DUE  DAYS_LAST_DUE_1ST_VERSION  DAYS_LAST_DUE  DAYS_TERMINATION  \
0           -42.0                      300.0          -42.0             -37.0   
1          -134.0                      916.0       365243.0          365243.0   
2          -271.0                       59.0       365243.0          365243.0   
3          -482.0                     -152.0         -182.0            -177.0   
4             NaN                        NaN            NaN               NaN   

   NFLAG_INSURED_ON_APPROVAL  
0                        0.0  
1                        1.0  
2                        1.0  
3                        1.0  
4                        NaN  
         SK_ID_PREV  SK_ID_CURR NAME_CONTRACT_TYPE  AMT_ANNUITY  \
1670209     2300464      352015     Consumer loans    14704.290   
1670210     2357031      334635     Consumer loans     6622.020   
1670211     2659632      249544     Consumer loans    11520.855   
1670212     2785582      400317         Cash loans    18821.520   
1670213     2418762      261212         Cash loans    16431.300   

         AMT_APPLICATION  AMT_CREDIT  AMT_DOWN_PAYMENT  AMT_GOODS_PRICE  \
1670209         267295.5    311400.0               0.0         267295.5   
1670210          87750.0     64291.5           29250.0          87750.0   
1670211         105237.0    102523.5           10525.5         105237.0   
1670212         180000.0    191880.0               NaN         180000.0   
1670213         360000.0    360000.0               NaN         360000.0   

        WEEKDAY_APPR_PROCESS_START  HOUR_APPR_PROCESS_START  \
1670209                  WEDNESDAY                       12   
1670210                    TUESDAY                       15   
1670211                     MONDAY                       12   
1670212                  WEDNESDAY                        9   
1670213                     SUNDAY                       10   

        FLAG_LAST_APPL_PER_CONTRACT  NFLAG_LAST_APPL_IN_DAY  \
1670209                           Y                       1   
1670210                           Y                       1   
1670211                           Y                       1   
1670212                           Y                       1   
1670213                           Y                       1   

         RATE_DOWN_PAYMENT  RATE_INTEREST_PRIMARY  RATE_INTEREST_PRIVILEGED  \
1670209           0.000000                    NaN                       NaN   
1670210           0.340554                    NaN                       NaN   
1670211           0.101401                    NaN                       NaN   
1670212                NaN                    NaN                       NaN   
1670213                NaN                    NaN                       NaN   

        NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS  DAYS_DECISION  \
1670209                    XAP             Approved           -544   
1670210                    XAP             Approved          -1694   
1670211                    XAP             Approved          -1488   
1670212                    XNA             Approved          -1185   
1670213                    XNA             Approved          -1193   

             NAME_PAYMENT_TYPE CODE_REJECT_REASON  NAME_TYPE_SUITE  \
1670209  Cash through the bank                XAP              NaN   
1670210  Cash through the bank                XAP    Unaccompanied   
1670211  Cash through the bank                XAP  Spouse, partner   
1670212  Cash through the bank                XAP           Family   
1670213  Cash through the bank                XAP           Family   

        NAME_CLIENT_TYPE   NAME_GOODS_CATEGORY NAME_PORTFOLIO  \
1670209        Refreshed             Furniture            POS   
1670210              New             Furniture            POS   
1670211         Repeater  Consumer Electronics            POS   
1670212         Repeater                   XNA           Cash   
1670213         Repeater                   XNA           Cash   

        NAME_PRODUCT_TYPE     CHANNEL_TYPE  SELLERPLACE_AREA  \
1670209               XNA            Stone                43   
1670210               XNA            Stone                43   
1670211               XNA     Country-wide              1370   
1670212            x-sell  AP+ (Cash loan)                -1   
1670213            x-sell  AP+ (Cash loan)                -1   

         NAME_SELLER_INDUSTRY  CNT_PAYMENT NAME_YIELD_GROUP  \
1670209             Furniture         30.0       low_normal   
1670210             Furniture         12.0           middle   
1670211  Consumer electronics         10.0       low_normal   
1670212                   XNA         12.0       low_normal   
1670213                   XNA         48.0           middle   

                 PRODUCT_COMBINATION  DAYS_FIRST_DRAWING  DAYS_FIRST_DUE  \
1670209   POS industry with interest            365243.0          -508.0   
1670210   POS industry with interest            365243.0         -1604.0   
1670211  POS household with interest            365243.0         -1457.0   
1670212             Cash X-Sell: low            365243.0         -1155.0   
1670213          Cash X-Sell: middle            365243.0         -1163.0   

         DAYS_LAST_DUE_1ST_VERSION  DAYS_LAST_DUE  DAYS_TERMINATION  \
1670209                      362.0         -358.0            -351.0   
1670210                    -1274.0        -1304.0           -1297.0   
1670211                    -1187.0        -1187.0           -1181.0   
1670212                     -825.0         -825.0            -817.0   
1670213                      247.0         -443.0            -423.0   

         NFLAG_INSURED_ON_APPROVAL  
1670209                        0.0  
1670210                        0.0  
1670211                        0.0  
1670212                        1.0  
1670213                        0.0  
         SK_ID_PREV    SK_ID_CURR   AMT_ANNUITY  AMT_APPLICATION  \
count  1.670214e+06  1.670214e+06  1.297979e+06     1.670214e+06   
mean   1.923089e+06  2.783572e+05  1.595512e+04     1.752339e+05   
std    5.325980e+05  1.028148e+05  1.478214e+04     2.927798e+05   
min    1.000001e+06  1.000010e+05  0.000000e+00     0.000000e+00   
25%    1.461857e+06  1.893290e+05  6.321780e+03     1.872000e+04   
50%    1.923110e+06  2.787145e+05  1.125000e+04     7.104600e+04   
75%    2.384280e+06  3.675140e+05  2.065842e+04     1.803600e+05   
max    2.845382e+06  4.562550e+05  4.180581e+05     6.905160e+06   

         AMT_CREDIT  AMT_DOWN_PAYMENT  AMT_GOODS_PRICE  \
count  1.670213e+06      7.743700e+05     1.284699e+06   
mean   1.961140e+05      6.697402e+03     2.278473e+05   
std    3.185746e+05      2.092150e+04     3.153966e+05   
min    0.000000e+00     -9.000000e-01     0.000000e+00   
25%    2.416050e+04      0.000000e+00     5.084100e+04   
50%    8.054100e+04      1.638000e+03     1.123200e+05   
75%    2.164185e+05      7.740000e+03     2.340000e+05   
max    6.905160e+06      3.060045e+06     6.905160e+06   

       HOUR_APPR_PROCESS_START  NFLAG_LAST_APPL_IN_DAY  RATE_DOWN_PAYMENT  \
count             1.670214e+06            1.670214e+06      774370.000000   
mean              1.248418e+01            9.964675e-01           0.079637   
std               3.334028e+00            5.932963e-02           0.107823   
min               0.000000e+00            0.000000e+00          -0.000015   
25%               1.000000e+01            1.000000e+00           0.000000   
50%               1.200000e+01            1.000000e+00           0.051605   
75%               1.500000e+01            1.000000e+00           0.108909   
max               2.300000e+01            1.000000e+00           1.000000   

       RATE_INTEREST_PRIMARY  RATE_INTEREST_PRIVILEGED  DAYS_DECISION  \
count            5951.000000               5951.000000   1.670214e+06   
mean                0.188357                  0.773503  -8.806797e+02   
std                 0.087671                  0.100879   7.790997e+02   
min                 0.034781                  0.373150  -2.922000e+03   
25%                 0.160716                  0.715645  -1.300000e+03   
50%                 0.189122                  0.835095  -5.810000e+02   
75%                 0.193330                  0.852537  -2.800000e+02   
max                 1.000000                  1.000000  -1.000000e+00   

       SELLERPLACE_AREA   CNT_PAYMENT  DAYS_FIRST_DRAWING  DAYS_FIRST_DUE  \
count      1.670214e+06  1.297984e+06       997149.000000   997149.000000   
mean       3.139511e+02  1.605408e+01       342209.855039    13826.269337   
std        7.127443e+03  1.456729e+01        88916.115833    72444.869708   
min       -1.000000e+00  0.000000e+00        -2922.000000    -2892.000000   
25%       -1.000000e+00  6.000000e+00       365243.000000    -1628.000000   
50%        3.000000e+00  1.200000e+01       365243.000000     -831.000000   
75%        8.200000e+01  2.400000e+01       365243.000000     -411.000000   
max        4.000000e+06  8.400000e+01       365243.000000   365243.000000   

       DAYS_LAST_DUE_1ST_VERSION  DAYS_LAST_DUE  DAYS_TERMINATION  \
count              997149.000000  997149.000000     997149.000000   
mean                33767.774054   76582.403064      81992.343838   
std                106857.034789  149647.415123     153303.516729   
min                 -2801.000000   -2889.000000      -2874.000000   
25%                 -1242.000000   -1314.000000      -1270.000000   
50%                  -361.000000    -537.000000       -499.000000   
75%                   129.000000     -74.000000        -44.000000   
max                365243.000000  365243.000000     365243.000000   

       NFLAG_INSURED_ON_APPROVAL  
count              997149.000000  
mean                    0.332570  
std                     0.471134  
min                     0.000000  
25%                     0.000000  
50%                     0.000000  
75%                     1.000000  
max                     1.000000  
In [16]:
print("Missing Values Count:")
print(prev_data.isnull().sum().sort_values(ascending=False))
Missing Values Count:
RATE_INTEREST_PRIVILEGED       1664263
RATE_INTEREST_PRIMARY          1664263
AMT_DOWN_PAYMENT                895844
RATE_DOWN_PAYMENT               895844
NAME_TYPE_SUITE                 820405
NFLAG_INSURED_ON_APPROVAL       673065
DAYS_TERMINATION                673065
DAYS_LAST_DUE                   673065
DAYS_LAST_DUE_1ST_VERSION       673065
DAYS_FIRST_DUE                  673065
DAYS_FIRST_DRAWING              673065
AMT_GOODS_PRICE                 385515
AMT_ANNUITY                     372235
CNT_PAYMENT                     372230
PRODUCT_COMBINATION                346
AMT_CREDIT                           1
NAME_YIELD_GROUP                     0
NAME_PORTFOLIO                       0
NAME_SELLER_INDUSTRY                 0
SELLERPLACE_AREA                     0
CHANNEL_TYPE                         0
NAME_PRODUCT_TYPE                    0
SK_ID_PREV                           0
NAME_GOODS_CATEGORY                  0
NAME_CLIENT_TYPE                     0
CODE_REJECT_REASON                   0
SK_ID_CURR                           0
DAYS_DECISION                        0
NAME_CONTRACT_STATUS                 0
NAME_CASH_LOAN_PURPOSE               0
NFLAG_LAST_APPL_IN_DAY               0
FLAG_LAST_APPL_PER_CONTRACT          0
HOUR_APPR_PROCESS_START              0
WEEKDAY_APPR_PROCESS_START           0
AMT_APPLICATION                      0
NAME_CONTRACT_TYPE                   0
NAME_PAYMENT_TYPE                    0
dtype: int64
In [17]:
missing_percent = (prev_data.isnull().sum() / len(app_data)) * 100
missing_df = pd.DataFrame({
    'Column': prev_data.columns,
    'Missing_Count': prev_data.isnull().sum(),
    'Missing_Percentage': missing_percent
}).sort_values('Missing_Percentage', ascending=False)
print(missing_df[missing_df['Missing_Percentage'] > 0])
                                              Column  Missing_Count  \
RATE_INTEREST_PRIVILEGED    RATE_INTEREST_PRIVILEGED        1664263   
RATE_INTEREST_PRIMARY          RATE_INTEREST_PRIMARY        1664263   
AMT_DOWN_PAYMENT                    AMT_DOWN_PAYMENT         895844   
RATE_DOWN_PAYMENT                  RATE_DOWN_PAYMENT         895844   
NAME_TYPE_SUITE                      NAME_TYPE_SUITE         820405   
NFLAG_INSURED_ON_APPROVAL  NFLAG_INSURED_ON_APPROVAL         673065   
DAYS_TERMINATION                    DAYS_TERMINATION         673065   
DAYS_LAST_DUE                          DAYS_LAST_DUE         673065   
DAYS_LAST_DUE_1ST_VERSION  DAYS_LAST_DUE_1ST_VERSION         673065   
DAYS_FIRST_DUE                        DAYS_FIRST_DUE         673065   
DAYS_FIRST_DRAWING                DAYS_FIRST_DRAWING         673065   
AMT_GOODS_PRICE                      AMT_GOODS_PRICE         385515   
AMT_ANNUITY                              AMT_ANNUITY         372235   
CNT_PAYMENT                              CNT_PAYMENT         372230   
PRODUCT_COMBINATION              PRODUCT_COMBINATION            346   
AMT_CREDIT                                AMT_CREDIT              1   

                           Missing_Percentage  
RATE_INTEREST_PRIVILEGED           541.204380  
RATE_INTEREST_PRIMARY              541.204380  
AMT_DOWN_PAYMENT                   291.320961  
RATE_DOWN_PAYMENT                  291.320961  
NAME_TYPE_SUITE                    266.788830  
NFLAG_INSURED_ON_APPROVAL          218.875097  
DAYS_TERMINATION                   218.875097  
DAYS_LAST_DUE                      218.875097  
DAYS_LAST_DUE_1ST_VERSION          218.875097  
DAYS_FIRST_DUE                     218.875097  
DAYS_FIRST_DRAWING                 218.875097  
AMT_GOODS_PRICE                    125.366247  
AMT_ANNUITY                        121.047702  
CNT_PAYMENT                        121.046076  
PRODUCT_COMBINATION                  0.112516  
AMT_CREDIT                           0.000325  
In [18]:
pd.set_option("display.max_rows", 100)
app_data.isnull().mean()*100
Out[18]:
SK_ID_CURR                     0.000000
TARGET                         0.000000
NAME_CONTRACT_TYPE             0.000000
CODE_GENDER                    0.000000
FLAG_OWN_CAR                   0.000000
                                ...    
AMT_REQ_CREDIT_BUREAU_DAY     13.501631
AMT_REQ_CREDIT_BUREAU_WEEK    13.501631
AMT_REQ_CREDIT_BUREAU_MON     13.501631
AMT_REQ_CREDIT_BUREAU_QRT     13.501631
AMT_REQ_CREDIT_BUREAU_YEAR    13.501631
Length: 122, dtype: float64
In [20]:
percentage = 47
threshold = int(((100-percentage)/100)*app_data.shape[0]+1)
app_df = app_data.dropna(axis=1,how = 'any')
app_df.head()
Out[20]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 Working Secondary / secondary special Single / not married House / apartment 0.018801 -9461 -637 -3648.0 -2120 1 1 0 1 1 0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Business Entity Type 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 State servant Higher education Married House / apartment 0.003541 -16765 -1188 -1186.0 -291 1 1 0 1 1 0 1 1 MONDAY 11 0 0 0 0 0 0 School 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 Working Secondary / secondary special Single / not married House / apartment 0.010032 -19046 -225 -4260.0 -2531 1 1 1 1 1 0 2 2 MONDAY 9 0 0 0 0 0 0 Government 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 Working Secondary / secondary special Civil marriage House / apartment 0.008019 -19005 -3039 -9833.0 -2437 1 1 0 1 0 0 2 2 WEDNESDAY 17 0 0 0 0 0 0 Business Entity Type 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 Working Secondary / secondary special Single / not married House / apartment 0.028663 -19932 -3038 -4311.0 -3458 1 1 0 1 0 0 2 2 THURSDAY 11 0 0 0 0 1 1 Religion 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
In [25]:
app_df.shape
Out[25]:
(307511, 76)
In [27]:
app_df.isnull().mean()*100
Out[27]:
SK_ID_CURR                     0.0
TARGET                         0.0
NAME_CONTRACT_TYPE             0.0
CODE_GENDER                    0.0
FLAG_OWN_CAR                   0.0
FLAG_OWN_REALTY                0.0
CNT_CHILDREN                   0.0
AMT_INCOME_TOTAL               0.0
AMT_CREDIT                     0.0
AMT_ANNUITY                    0.0
AMT_GOODS_PRICE                0.0
NAME_INCOME_TYPE               0.0
NAME_EDUCATION_TYPE            0.0
NAME_FAMILY_STATUS             0.0
NAME_HOUSING_TYPE              0.0
REGION_POPULATION_RELATIVE     0.0
DAYS_BIRTH                     0.0
DAYS_EMPLOYED                  0.0
DAYS_REGISTRATION              0.0
DAYS_ID_PUBLISH                0.0
FLAG_MOBIL                     0.0
FLAG_EMP_PHONE                 0.0
FLAG_WORK_PHONE                0.0
FLAG_CONT_MOBILE               0.0
FLAG_PHONE                     0.0
FLAG_EMAIL                     0.0
CNT_FAM_MEMBERS                0.0
REGION_RATING_CLIENT           0.0
REGION_RATING_CLIENT_W_CITY    0.0
WEEKDAY_APPR_PROCESS_START     0.0
HOUR_APPR_PROCESS_START        0.0
REG_REGION_NOT_LIVE_REGION     0.0
REG_REGION_NOT_WORK_REGION     0.0
LIVE_REGION_NOT_WORK_REGION    0.0
REG_CITY_NOT_LIVE_CITY         0.0
REG_CITY_NOT_WORK_CITY         0.0
LIVE_CITY_NOT_WORK_CITY        0.0
ORGANIZATION_TYPE              0.0
EXT_SOURCE_2                   0.0
EXT_SOURCE_3                   0.0
OBS_30_CNT_SOCIAL_CIRCLE       0.0
DEF_30_CNT_SOCIAL_CIRCLE       0.0
OBS_60_CNT_SOCIAL_CIRCLE       0.0
DEF_60_CNT_SOCIAL_CIRCLE       0.0
DAYS_LAST_PHONE_CHANGE         0.0
FLAG_DOCUMENT_2                0.0
FLAG_DOCUMENT_3                0.0
FLAG_DOCUMENT_4                0.0
FLAG_DOCUMENT_5                0.0
FLAG_DOCUMENT_6                0.0
FLAG_DOCUMENT_7                0.0
FLAG_DOCUMENT_8                0.0
FLAG_DOCUMENT_9                0.0
FLAG_DOCUMENT_10               0.0
FLAG_DOCUMENT_11               0.0
FLAG_DOCUMENT_12               0.0
FLAG_DOCUMENT_13               0.0
FLAG_DOCUMENT_14               0.0
FLAG_DOCUMENT_15               0.0
FLAG_DOCUMENT_16               0.0
FLAG_DOCUMENT_17               0.0
FLAG_DOCUMENT_18               0.0
FLAG_DOCUMENT_19               0.0
FLAG_DOCUMENT_20               0.0
FLAG_DOCUMENT_21               0.0
AMT_REQ_CREDIT_BUREAU_HOUR     0.0
AMT_REQ_CREDIT_BUREAU_DAY      0.0
AMT_REQ_CREDIT_BUREAU_WEEK     0.0
AMT_REQ_CREDIT_BUREAU_MON      0.0
AMT_REQ_CREDIT_BUREAU_QRT      0.0
AMT_REQ_CREDIT_BUREAU_YEAR     0.0
YEARS_BIRTH                    0.0
YEARS_EMPLOYED                 0.0
YEARS_REGISTRATION             0.0
YEARS_ID_PUBLISH               0.0
YEARS_LAST_PHONE_CHANGE        0.0
dtype: float64
In [28]:
app_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 76 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307511 non-null  float64
 10  AMT_GOODS_PRICE              307511 non-null  float64
 11  NAME_INCOME_TYPE             307511 non-null  object 
 12  NAME_EDUCATION_TYPE          307511 non-null  object 
 13  NAME_FAMILY_STATUS           307511 non-null  object 
 14  NAME_HOUSING_TYPE            307511 non-null  object 
 15  REGION_POPULATION_RELATIVE   307511 non-null  float64
 16  DAYS_BIRTH                   307511 non-null  int64  
 17  DAYS_EMPLOYED                307511 non-null  int64  
 18  DAYS_REGISTRATION            307511 non-null  float64
 19  DAYS_ID_PUBLISH              307511 non-null  int64  
 20  FLAG_MOBIL                   307511 non-null  int64  
 21  FLAG_EMP_PHONE               307511 non-null  int64  
 22  FLAG_WORK_PHONE              307511 non-null  int64  
 23  FLAG_CONT_MOBILE             307511 non-null  int64  
 24  FLAG_PHONE                   307511 non-null  int64  
 25  FLAG_EMAIL                   307511 non-null  int64  
 26  CNT_FAM_MEMBERS              307511 non-null  float64
 27  REGION_RATING_CLIENT         307511 non-null  int64  
 28  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64  
 29  WEEKDAY_APPR_PROCESS_START   307511 non-null  object 
 30  HOUR_APPR_PROCESS_START      307511 non-null  int64  
 31  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64  
 32  REG_REGION_NOT_WORK_REGION   307511 non-null  int64  
 33  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64  
 34  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64  
 35  REG_CITY_NOT_WORK_CITY       307511 non-null  int64  
 36  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64  
 37  ORGANIZATION_TYPE            307511 non-null  object 
 38  EXT_SOURCE_2                 307511 non-null  float64
 39  EXT_SOURCE_3                 307511 non-null  float64
 40  OBS_30_CNT_SOCIAL_CIRCLE     307511 non-null  float64
 41  DEF_30_CNT_SOCIAL_CIRCLE     307511 non-null  float64
 42  OBS_60_CNT_SOCIAL_CIRCLE     307511 non-null  float64
 43  DEF_60_CNT_SOCIAL_CIRCLE     307511 non-null  float64
 44  DAYS_LAST_PHONE_CHANGE       307511 non-null  float64
 45  FLAG_DOCUMENT_2              307511 non-null  int64  
 46  FLAG_DOCUMENT_3              307511 non-null  int64  
 47  FLAG_DOCUMENT_4              307511 non-null  int64  
 48  FLAG_DOCUMENT_5              307511 non-null  int64  
 49  FLAG_DOCUMENT_6              307511 non-null  int64  
 50  FLAG_DOCUMENT_7              307511 non-null  int64  
 51  FLAG_DOCUMENT_8              307511 non-null  int64  
 52  FLAG_DOCUMENT_9              307511 non-null  int64  
 53  FLAG_DOCUMENT_10             307511 non-null  int64  
 54  FLAG_DOCUMENT_11             307511 non-null  int64  
 55  FLAG_DOCUMENT_12             307511 non-null  int64  
 56  FLAG_DOCUMENT_13             307511 non-null  int64  
 57  FLAG_DOCUMENT_14             307511 non-null  int64  
 58  FLAG_DOCUMENT_15             307511 non-null  int64  
 59  FLAG_DOCUMENT_16             307511 non-null  int64  
 60  FLAG_DOCUMENT_17             307511 non-null  int64  
 61  FLAG_DOCUMENT_18             307511 non-null  int64  
 62  FLAG_DOCUMENT_19             307511 non-null  int64  
 63  FLAG_DOCUMENT_20             307511 non-null  int64  
 64  FLAG_DOCUMENT_21             307511 non-null  int64  
 65  AMT_REQ_CREDIT_BUREAU_HOUR   307511 non-null  float64
 66  AMT_REQ_CREDIT_BUREAU_DAY    307511 non-null  float64
 67  AMT_REQ_CREDIT_BUREAU_WEEK   307511 non-null  float64
 68  AMT_REQ_CREDIT_BUREAU_MON    307511 non-null  float64
 69  AMT_REQ_CREDIT_BUREAU_QRT    307511 non-null  float64
 70  AMT_REQ_CREDIT_BUREAU_YEAR   307511 non-null  float64
 71  YEARS_BIRTH                  307511 non-null  int64  
 72  YEARS_EMPLOYED               307511 non-null  int64  
 73  YEARS_REGISTRATION           307511 non-null  float64
 74  YEARS_ID_PUBLISH             307511 non-null  int64  
 75  YEARS_LAST_PHONE_CHANGE      307511 non-null  float64
dtypes: float64(22), int64(44), object(10)
memory usage: 178.3+ MB
In [34]:
app_df.EXT_SOURCE_3.isnull().mean()*100
Out[34]:
0.0
In [33]:
app_df.EXT_SOURCE_3.value_counts(normalize =True)*100
Out[33]:
EXT_SOURCE_3
0.535276    20.080908
0.746300     0.474780
0.713631     0.427627
0.694093     0.414945
0.670652     0.387303
              ...    
0.021492     0.000325
0.019468     0.000325
0.023062     0.000325
0.014556     0.000325
0.043227     0.000325
Name: proportion, Length: 814, dtype: float64
In [36]:
app_df.EXT_SOURCE_3.describe()
Out[36]:
count    307511.000000
mean          0.515695
std           0.174736
min           0.000527
25%           0.417100
50%           0.535276
75%           0.636376
max           0.896010
Name: EXT_SOURCE_3, dtype: float64
In [32]:
for col in app_data.select_dtypes(include = [np.number]).columns:
    median = app_data[col].median()
    app_data[col].fillna(median, inplace = True)   
In [37]:
sns.boxplot(app_df.EXT_SOURCE_3)
plt.show()
No description has been provided for this image
In [38]:
app_df.EXT_SOURCE_3.fillna(app_df.EXT_SOURCE_3.median(),inplace =True)
In [39]:
app_df.EXT_SOURCE_3.isnull().mean()*100
Out[39]:
0.0
In [40]:
app_df.EXT_SOURCE_3.value_counts(normalize =True)*100
Out[40]:
EXT_SOURCE_3
0.535276    20.080908
0.746300     0.474780
0.713631     0.427627
0.694093     0.414945
0.670652     0.387303
              ...    
0.021492     0.000325
0.019468     0.000325
0.023062     0.000325
0.014556     0.000325
0.043227     0.000325
Name: proportion, Length: 814, dtype: float64
In [41]:
null_cols = list(app_df.isna().any())
len(null_cols)
Out[41]:
76
In [43]:
app_df.isnull().mean()*100
Out[43]:
SK_ID_CURR                     0.0
TARGET                         0.0
NAME_CONTRACT_TYPE             0.0
CODE_GENDER                    0.0
FLAG_OWN_CAR                   0.0
FLAG_OWN_REALTY                0.0
CNT_CHILDREN                   0.0
AMT_INCOME_TOTAL               0.0
AMT_CREDIT                     0.0
AMT_ANNUITY                    0.0
AMT_GOODS_PRICE                0.0
NAME_INCOME_TYPE               0.0
NAME_EDUCATION_TYPE            0.0
NAME_FAMILY_STATUS             0.0
NAME_HOUSING_TYPE              0.0
REGION_POPULATION_RELATIVE     0.0
DAYS_BIRTH                     0.0
DAYS_EMPLOYED                  0.0
DAYS_REGISTRATION              0.0
DAYS_ID_PUBLISH                0.0
FLAG_MOBIL                     0.0
FLAG_EMP_PHONE                 0.0
FLAG_WORK_PHONE                0.0
FLAG_CONT_MOBILE               0.0
FLAG_PHONE                     0.0
FLAG_EMAIL                     0.0
CNT_FAM_MEMBERS                0.0
REGION_RATING_CLIENT           0.0
REGION_RATING_CLIENT_W_CITY    0.0
WEEKDAY_APPR_PROCESS_START     0.0
HOUR_APPR_PROCESS_START        0.0
REG_REGION_NOT_LIVE_REGION     0.0
REG_REGION_NOT_WORK_REGION     0.0
LIVE_REGION_NOT_WORK_REGION    0.0
REG_CITY_NOT_LIVE_CITY         0.0
REG_CITY_NOT_WORK_CITY         0.0
LIVE_CITY_NOT_WORK_CITY        0.0
ORGANIZATION_TYPE              0.0
EXT_SOURCE_2                   0.0
EXT_SOURCE_3                   0.0
OBS_30_CNT_SOCIAL_CIRCLE       0.0
DEF_30_CNT_SOCIAL_CIRCLE       0.0
OBS_60_CNT_SOCIAL_CIRCLE       0.0
DEF_60_CNT_SOCIAL_CIRCLE       0.0
DAYS_LAST_PHONE_CHANGE         0.0
FLAG_DOCUMENT_2                0.0
FLAG_DOCUMENT_3                0.0
FLAG_DOCUMENT_4                0.0
FLAG_DOCUMENT_5                0.0
FLAG_DOCUMENT_6                0.0
FLAG_DOCUMENT_7                0.0
FLAG_DOCUMENT_8                0.0
FLAG_DOCUMENT_9                0.0
FLAG_DOCUMENT_10               0.0
FLAG_DOCUMENT_11               0.0
FLAG_DOCUMENT_12               0.0
FLAG_DOCUMENT_13               0.0
FLAG_DOCUMENT_14               0.0
FLAG_DOCUMENT_15               0.0
FLAG_DOCUMENT_16               0.0
FLAG_DOCUMENT_17               0.0
FLAG_DOCUMENT_18               0.0
FLAG_DOCUMENT_19               0.0
FLAG_DOCUMENT_20               0.0
FLAG_DOCUMENT_21               0.0
AMT_REQ_CREDIT_BUREAU_HOUR     0.0
AMT_REQ_CREDIT_BUREAU_DAY      0.0
AMT_REQ_CREDIT_BUREAU_WEEK     0.0
AMT_REQ_CREDIT_BUREAU_MON      0.0
AMT_REQ_CREDIT_BUREAU_QRT      0.0
AMT_REQ_CREDIT_BUREAU_YEAR     0.0
YEARS_BIRTH                    0.0
YEARS_EMPLOYED                 0.0
YEARS_REGISTRATION             0.0
YEARS_ID_PUBLISH               0.0
YEARS_LAST_PHONE_CHANGE        0.0
dtype: float64
In [45]:
app_df.AMT_REQ_CREDIT_BUREAU_DAY.value_counts(normalize = True)*100
Out[45]:
AMT_REQ_CREDIT_BUREAU_DAY
0.0    99.515790
1.0     0.420148
2.0     0.034470
3.0     0.014634
4.0     0.008455
5.0     0.002927
6.0     0.002602
9.0     0.000650
8.0     0.000325
Name: proportion, dtype: float64
In [47]:
cols = ['AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']
In [49]:
for col in cols:
    app_df[col].fillna(app_df[col].mode()[0],inplace =True)
In [ ]:
app_df.EXT_SOURCE_2.fillna(app_dF.EXT_SOURCE_2.medain(),inplace =True)
In [10]:
for col in app_data.select_dtypes(include = [np.number]).columns:
    app_data[col] = app_data[col].abs()
In [11]:
days_columns = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE']
for col in days_columns:
    new_col = col.replace('DAYS', 'YEARS')
    app_data[new_col] = app_data[col].apply(lambda x: x // 365)
In [12]:
print(app_data.isnull().sum())
SK_ID_CURR                 0
TARGET                     0
NAME_CONTRACT_TYPE         0
CODE_GENDER                0
FLAG_OWN_CAR               0
                          ..
YEARS_BIRTH                0
YEARS_EMPLOYED             0
YEARS_REGISTRATION         0
YEARS_ID_PUBLISH           0
YEARS_LAST_PHONE_CHANGE    0
Length: 78, dtype: int64
In [13]:
bins = [0, 200000, 400000, 600000, 800000, 1000000]
labels = ['Very Low Credit', 'Low Credit', 'Medium Credit', 'High Credit', 'Very High Credit']
app_data['AMT_CREDIT_CATEGORY'] = pd.cut(app_data['AMT_CREDIT'], bins=bins,labels=labels)
In [14]:
sns.countplot(data = app_data, x='AMT_CREDIT_CATEGORY')
plt.show()
No description has been provided for this image
In [15]:
app_data.OCCUPATION_TYPE.isnull().mean() * 100
Out[15]:
31.345545362604916
In [16]:
app_data.OCCUPATION_TYPE.value_counts(normalize = True) * 100
Out[16]:
OCCUPATION_TYPE
Laborers                 26.139636
Sales staff              15.205570
Core staff               13.058924
Managers                 10.122679
Drivers                   8.811576
High skill tech staff     5.390299
Accountants               4.648067
Medicine staff            4.043672
Security staff            3.183498
Cooking staff             2.816408
Cleaning staff            2.203960
Private service staff     1.256158
Low-skill Laborers        0.991379
Waiters/barmen staff      0.638499
Secretaries               0.618132
Realty agents             0.355722
HR staff                  0.266673
IT staff                  0.249147
Name: proportion, dtype: float64
In [26]:
categorical_cols = []
numerical_cols = []

for col in app_data.columns:
    if app_data[col].dtype == 'object' or app_data[col].nunique() < 10:
        categorical_cols.append(col)
    else:
        numerical_cols.append(col)

print(f"Categorical columns: {len(categorical_cols)}")
print(f"Numerical columns: {len(numerical_cols)}")
Categorical columns: 55
Numerical columns: 67
In [29]:
def plot_categorical_analysis(df, categorical_cols, target_col='TARGET'):
    plt.figure(figsize=(15, 20))
    
    for i, col in enumerate(categorical_cols[:12]):  # Limit to first 12
        plt.subplot(4, 3, i+1)
        
        # Count plot
        df[col].value_counts().plot(kind='bar')
        plt.title(f'Distribution of {col}')
        plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
plot_categorical_analysis(app_data, categorical_cols)
No description has been provided for this image
In [30]:
def plot_numerical_analysis(df, numerical_cols):
    plt.figure(figsize=(15, 20))
    
    for i, col in enumerate(numerical_cols[:12]):  # Limit to first 12
        plt.subplot(4, 3, i+1)
        
        # Histogram
        df[col].hist(bins=30, alpha=0.7)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
    
    plt.tight_layout()
    plt.show()

plot_numerical_analysis(app_data, numerical_cols)
No description has been provided for this image
In [32]:
## Univariate Analysis with Target Variable
def plot_categorical_target_analysis(df, categorical_cols, target_col='TARGET'):
    plt.figure(figsize=(15, 20))
    
    for i, col in enumerate(categorical_cols[:8]):
        plt.subplot(4, 2, i+1)
        
        # Crosstab with percentages
        ct = pd.crosstab(df[col], df[target_col], normalize='index') * 100
        ct.plot(kind='bar', stacked=True)
        plt.title(f'{col} vs Target')
        plt.xticks(rotation=45)
        plt.legend(['No Difficulty', 'Difficulty'])
    
    plt.tight_layout()
    plt.show()

plot_categorical_target_analysis(app_data, categorical_cols)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [34]:
## Bivariate Analysis

### Correlation analysis

# Correlation matrix for numerical variables
numerical_data = app_data[numerical_cols].select_dtypes(include=[np.number])

plt.figure(figsize=(12, 10))
correlation_matrix = numerical_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Numerical Variables')
plt.show()

# Find highly correlated pairs
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_val = correlation_matrix.iloc[i, j]
        if abs(corr_val) > 0.7:  # High correlation threshold
            high_corr_pairs.append((correlation_matrix.columns[i], 
                                  correlation_matrix.columns[j], 
                                  corr_val))

print("Highly correlated pairs:")
for pair in high_corr_pairs:
    print(f"{pair[0]} - {pair[1]}: {pair[2]:.3f}")
No description has been provided for this image
Highly correlated pairs:
CNT_CHILDREN - CNT_FAM_MEMBERS: 0.879
AMT_CREDIT - AMT_ANNUITY: 0.770
AMT_CREDIT - AMT_GOODS_PRICE: 0.987
AMT_ANNUITY - AMT_GOODS_PRICE: 0.775
APARTMENTS_AVG - ELEVATORS_AVG: 0.837
APARTMENTS_AVG - LIVINGAPARTMENTS_AVG: 0.944
APARTMENTS_AVG - LIVINGAREA_AVG: 0.914
APARTMENTS_AVG - APARTMENTS_MODE: 0.973
APARTMENTS_AVG - ELEVATORS_MODE: 0.823
APARTMENTS_AVG - LIVINGAPARTMENTS_MODE: 0.931
APARTMENTS_AVG - LIVINGAREA_MODE: 0.893
APARTMENTS_AVG - APARTMENTS_MEDI: 0.995
APARTMENTS_AVG - ELEVATORS_MEDI: 0.835
APARTMENTS_AVG - LIVINGAPARTMENTS_MEDI: 0.942
APARTMENTS_AVG - LIVINGAREA_MEDI: 0.912
APARTMENTS_AVG - TOTALAREA_MODE: 0.893
BASEMENTAREA_AVG - BASEMENTAREA_MODE: 0.973
BASEMENTAREA_AVG - BASEMENTAREA_MEDI: 0.994
YEARS_BEGINEXPLUATATION_AVG - YEARS_BEGINEXPLUATATION_MODE: 0.972
YEARS_BEGINEXPLUATATION_AVG - YEARS_BEGINEXPLUATATION_MEDI: 0.994
YEARS_BUILD_AVG - YEARS_BUILD_MODE: 0.989
YEARS_BUILD_AVG - YEARS_BUILD_MEDI: 0.998
COMMONAREA_AVG - COMMONAREA_MODE: 0.977
COMMONAREA_AVG - COMMONAREA_MEDI: 0.996
ELEVATORS_AVG - LIVINGAPARTMENTS_AVG: 0.812
ELEVATORS_AVG - LIVINGAREA_AVG: 0.868
ELEVATORS_AVG - APARTMENTS_MODE: 0.806
ELEVATORS_AVG - ELEVATORS_MODE: 0.979
ELEVATORS_AVG - LIVINGAPARTMENTS_MODE: 0.797
ELEVATORS_AVG - LIVINGAREA_MODE: 0.839
ELEVATORS_AVG - APARTMENTS_MEDI: 0.835
ELEVATORS_AVG - ELEVATORS_MEDI: 0.996
ELEVATORS_AVG - LIVINGAPARTMENTS_MEDI: 0.813
ELEVATORS_AVG - LIVINGAREA_MEDI: 0.866
ELEVATORS_AVG - TOTALAREA_MODE: 0.845
ENTRANCES_AVG - ENTRANCES_MODE: 0.978
ENTRANCES_AVG - ENTRANCES_MEDI: 0.997
FLOORSMAX_AVG - FLOORSMIN_AVG: 0.742
FLOORSMAX_AVG - FLOORSMAX_MODE: 0.986
FLOORSMAX_AVG - FLOORSMIN_MODE: 0.722
FLOORSMAX_AVG - FLOORSMAX_MEDI: 0.997
FLOORSMAX_AVG - FLOORSMIN_MEDI: 0.740
FLOORSMIN_AVG - FLOORSMAX_MODE: 0.729
FLOORSMIN_AVG - FLOORSMIN_MODE: 0.986
FLOORSMIN_AVG - FLOORSMAX_MEDI: 0.740
FLOORSMIN_AVG - FLOORSMIN_MEDI: 0.997
LANDAREA_AVG - LANDAREA_MODE: 0.974
LANDAREA_AVG - LANDAREA_MEDI: 0.992
LIVINGAPARTMENTS_AVG - LIVINGAREA_AVG: 0.881
LIVINGAPARTMENTS_AVG - APARTMENTS_MODE: 0.908
LIVINGAPARTMENTS_AVG - ELEVATORS_MODE: 0.794
LIVINGAPARTMENTS_AVG - LIVINGAPARTMENTS_MODE: 0.970
LIVINGAPARTMENTS_AVG - LIVINGAREA_MODE: 0.852
LIVINGAPARTMENTS_AVG - APARTMENTS_MEDI: 0.936
LIVINGAPARTMENTS_AVG - ELEVATORS_MEDI: 0.809
LIVINGAPARTMENTS_AVG - LIVINGAPARTMENTS_MEDI: 0.994
LIVINGAPARTMENTS_AVG - LIVINGAREA_MEDI: 0.878
LIVINGAPARTMENTS_AVG - TOTALAREA_MODE: 0.848
LIVINGAREA_AVG - APARTMENTS_MODE: 0.891
LIVINGAREA_AVG - ELEVATORS_MODE: 0.853
LIVINGAREA_AVG - LIVINGAPARTMENTS_MODE: 0.873
LIVINGAREA_AVG - LIVINGAREA_MODE: 0.972
LIVINGAREA_AVG - APARTMENTS_MEDI: 0.913
LIVINGAREA_AVG - ELEVATORS_MEDI: 0.866
LIVINGAREA_AVG - LIVINGAPARTMENTS_MEDI: 0.883
LIVINGAREA_AVG - LIVINGAREA_MEDI: 0.996
LIVINGAREA_AVG - TOTALAREA_MODE: 0.925
NONLIVINGAPARTMENTS_AVG - NONLIVINGAPARTMENTS_MODE: 0.969
NONLIVINGAPARTMENTS_AVG - NONLIVINGAPARTMENTS_MEDI: 0.991
NONLIVINGAREA_AVG - NONLIVINGAREA_MODE: 0.966
NONLIVINGAREA_AVG - NONLIVINGAREA_MEDI: 0.990
APARTMENTS_MODE - ELEVATORS_MODE: 0.826
APARTMENTS_MODE - LIVINGAPARTMENTS_MODE: 0.938
APARTMENTS_MODE - LIVINGAREA_MODE: 0.910
APARTMENTS_MODE - APARTMENTS_MEDI: 0.977
APARTMENTS_MODE - ELEVATORS_MEDI: 0.809
APARTMENTS_MODE - LIVINGAPARTMENTS_MEDI: 0.915
APARTMENTS_MODE - LIVINGAREA_MEDI: 0.894
APARTMENTS_MODE - TOTALAREA_MODE: 0.864
BASEMENTAREA_MODE - BASEMENTAREA_MEDI: 0.978
YEARS_BEGINEXPLUATATION_MODE - YEARS_BEGINEXPLUATATION_MEDI: 0.964
YEARS_BUILD_MODE - YEARS_BUILD_MEDI: 0.989
COMMONAREA_MODE - COMMONAREA_MEDI: 0.980
ELEVATORS_MODE - LIVINGAPARTMENTS_MODE: 0.808
ELEVATORS_MODE - LIVINGAREA_MODE: 0.856
ELEVATORS_MODE - APARTMENTS_MEDI: 0.826
ELEVATORS_MODE - ELEVATORS_MEDI: 0.983
ELEVATORS_MODE - LIVINGAPARTMENTS_MEDI: 0.799
ELEVATORS_MODE - LIVINGAREA_MEDI: 0.856
ELEVATORS_MODE - TOTALAREA_MODE: 0.821
ENTRANCES_MODE - ENTRANCES_MEDI: 0.981
FLOORSMAX_MODE - FLOORSMIN_MODE: 0.726
FLOORSMAX_MODE - FLOORSMAX_MEDI: 0.988
FLOORSMAX_MODE - FLOORSMIN_MEDI: 0.730
FLOORSMIN_MODE - FLOORSMAX_MEDI: 0.723
FLOORSMIN_MODE - FLOORSMIN_MEDI: 0.988
LANDAREA_MODE - LANDAREA_MEDI: 0.981
LIVINGAPARTMENTS_MODE - LIVINGAREA_MODE: 0.878
LIVINGAPARTMENTS_MODE - APARTMENTS_MEDI: 0.932
LIVINGAPARTMENTS_MODE - ELEVATORS_MEDI: 0.799
LIVINGAPARTMENTS_MODE - LIVINGAPARTMENTS_MEDI: 0.976
LIVINGAPARTMENTS_MODE - LIVINGAREA_MEDI: 0.874
LIVINGAPARTMENTS_MODE - TOTALAREA_MODE: 0.834
LIVINGAREA_MODE - APARTMENTS_MEDI: 0.896
LIVINGAREA_MODE - ELEVATORS_MEDI: 0.841
LIVINGAREA_MODE - LIVINGAPARTMENTS_MEDI: 0.857
LIVINGAREA_MODE - LIVINGAREA_MEDI: 0.975
LIVINGAREA_MODE - TOTALAREA_MODE: 0.899
NONLIVINGAPARTMENTS_MODE - NONLIVINGAPARTMENTS_MEDI: 0.979
NONLIVINGAREA_MODE - NONLIVINGAREA_MEDI: 0.976
APARTMENTS_MEDI - ELEVATORS_MEDI: 0.837
APARTMENTS_MEDI - LIVINGAPARTMENTS_MEDI: 0.942
APARTMENTS_MEDI - LIVINGAREA_MEDI: 0.916
APARTMENTS_MEDI - TOTALAREA_MODE: 0.887
ELEVATORS_MEDI - LIVINGAPARTMENTS_MEDI: 0.814
ELEVATORS_MEDI - LIVINGAREA_MEDI: 0.868
ELEVATORS_MEDI - TOTALAREA_MODE: 0.838
FLOORSMAX_MEDI - FLOORSMIN_MEDI: 0.740
LIVINGAPARTMENTS_MEDI - LIVINGAREA_MEDI: 0.885
LIVINGAPARTMENTS_MEDI - TOTALAREA_MODE: 0.846
LIVINGAREA_MEDI - TOTALAREA_MODE: 0.919
OBS_30_CNT_SOCIAL_CIRCLE - OBS_60_CNT_SOCIAL_CIRCLE: 0.998
In [35]:
### Pair plots for key variables

# Create pair plots for important variables
key_vars = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'YEARS_BIRTH', 'TARGET']
if all(var in app_data.columns for var in key_vars):
    sns.pairplot(app_data[key_vars], hue='TARGET', diag_kind='hist')
    plt.show()
In [36]:
## Bivariate Analysis - Numerical vs Target

# Box plots for numerical variables vs target
def plot_numerical_vs_target(df, numerical_cols, target_col='TARGET'):
    plt.figure(figsize=(15, 20))
    
    for i, col in enumerate(numerical_cols[:12]):
        plt.subplot(4, 3, i+1)
        
        # Box plot
        df.boxplot(column=col, by=target_col, ax=plt.gca())
        plt.title(f'{col} by Target')
        plt.suptitle('')  # Remove default title
    
    plt.tight_layout()
    plt.show()

plot_numerical_vs_target(app_data, numerical_cols[:8])
No description has been provided for this image
In [37]:
## Multivariate Analysis

### Three-way analysis

# Analyze relationships between multiple variables
def multivariate_analysis(df, cat_var1, cat_var2, target_col='TARGET'):
    # Create crosstab
    ct = pd.crosstab([df[cat_var1], df[cat_var2]], df[target_col])
    
    # Calculate percentages
    ct_pct = pd.crosstab([df[cat_var1], df[cat_var2]], df[target_col], 
                        normalize='index') * 100
    
    print(f"Multivariate Analysis: {cat_var1} x {cat_var2} x {target_col}")
    print("Counts:")
    print(ct)
    print("\nPercentages:")
    print(ct_pct)
    
    # Visualization
    plt.figure(figsize=(12, 6))
    ct_pct.plot(kind='bar')
    plt.title(f'{cat_var1} x {cat_var2} vs Target (%)')
    plt.xticks(rotation=45)
    plt.show()

# Example multivariate analysis
if 'NAME_CONTRACT_TYPE' in app_data.columns and 'CODE_GENDER' in app_data.columns:
    multivariate_analysis(app_data, 'NAME_CONTRACT_TYPE', 'CODE_GENDER')
Multivariate Analysis: NAME_CONTRACT_TYPE x CODE_GENDER x TARGET
Counts:
TARGET                               0      1
NAME_CONTRACT_TYPE CODE_GENDER               
Cash loans         F            169673  13127
                   M             85338  10094
Revolving loans    F             18605   1043
                   M              9066    561
                   XNA               4      0

Percentages:
TARGET                                   0          1
NAME_CONTRACT_TYPE CODE_GENDER                       
Cash loans         F             92.818928   7.181072
                   M             89.422835  10.577165
Revolving loans    F             94.691572   5.308428
                   M             94.172639   5.827361
                   XNA          100.000000   0.000000
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image
In [38]:
## Advanced Analysis - Target-wise Correlation
            
# Separate data by target values
target_0 = app_data[app_data['TARGET'] == 0]
target_1 = app_data[app_data['TARGET'] == 1]

# Correlation for non-defaulters (TARGET = 0)
plt.figure(figsize=(10, 8))
corr_0 = target_0[numerical_cols].select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_0, annot=True, cmap='Blues', center=0, fmt='.2f')
plt.title('Correlation Matrix - Non-Defaulters (TARGET = 0)')
plt.show()

# Correlation for defaulters (TARGET = 1)
plt.figure(figsize=(10, 8))
corr_1 = target_1[numerical_cols].select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_1, annot=True, cmap='Reds', center=0, fmt='.2f')
plt.title('Correlation Matrix - Defaulters (TARGET = 1)')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [39]:
## Previous Application Analysis

# Load and analyze previous application data
print("Previous Application Analysis")
print("Shape:", prev_data.shape)

# Check missing values in previous data
prev_missing = (prev_data.isnull().sum() / len(prev_data)) * 100
prev_missing_df = prev_missing[prev_missing > 0].sort_values(ascending=False)
print("Missing values in Previous Application:")
print(prev_missing_df.head(10))

# Similar cleaning process for previous data
# Drop columns with >49% missing values
prev_threshold = 0.49
prev_high_missing = prev_missing_df[prev_missing_df > prev_threshold * 100].index
prev_data_clean = prev_data.drop(columns=prev_high_missing)

print(f"Previous data columns after cleaning: {len(prev_data_clean.columns)}")
Previous Application Analysis
Shape: (1670214, 37)
Missing values in Previous Application:
RATE_INTEREST_PRIMARY        99.643698
RATE_INTEREST_PRIVILEGED     99.643698
AMT_DOWN_PAYMENT             53.636480
RATE_DOWN_PAYMENT            53.636480
NAME_TYPE_SUITE              49.119754
DAYS_FIRST_DRAWING           40.298129
DAYS_FIRST_DUE               40.298129
DAYS_LAST_DUE_1ST_VERSION    40.298129
DAYS_LAST_DUE                40.298129
DAYS_TERMINATION             40.298129
dtype: float64
Previous data columns after cleaning: 32
In [41]:
## Merge Analysis

# Merge application data with previous application data
# Group previous applications by SK_ID_CURR and create aggregated features
prev_agg = prev_data_clean.groupby('SK_ID_CURR').agg({
    'AMT_ANNUITY': ['mean', 'max', 'min'],
    'AMT_APPLICATION': ['mean', 'max', 'min'],
    'AMT_CREDIT': ['mean', 'max', 'min'],
    'AMT_GOODS_PRICE': ['mean', 'max', 'min']
}).reset_index()

# Flatten column names
prev_agg.columns = ['SK_ID_CURR'] + ['_'.join(col).strip() for col in prev_agg.columns[1:]]

# Merge with application data
merged_data = app_data.merge(prev_agg, on='SK_ID_CURR', how='left')
print(f"Merged data shape: {merged_data.shape}")
Merged data shape: (307511, 134)
In [43]:
merged_data = pd.merge(app_data, prev_data, on='SK_ID_CURR', how='left')
# Drop columns starting with 'FLAG'
cols_to_drop = [col for col in merged_data.columns if col.startswith('FLAG')]
merged_data.drop(columns=cols_to_drop,inplace=True)
In [44]:
print(merged_data.head(7))
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE_x CODE_GENDER  CNT_CHILDREN  \
0      100002       1           Cash loans           M             0   
1      100003       0           Cash loans           F             0   
2      100003       0           Cash loans           F             0   
3      100003       0           Cash loans           F             0   
4      100004       0      Revolving loans           M             0   
5      100006       0           Cash loans           F             0   
6      100006       0           Cash loans           F             0   

   AMT_INCOME_TOTAL  AMT_CREDIT_x  AMT_ANNUITY_x  AMT_GOODS_PRICE_x  \
0          202500.0      406597.5        24700.5           351000.0   
1          270000.0     1293502.5        35698.5          1129500.0   
2          270000.0     1293502.5        35698.5          1129500.0   
3          270000.0     1293502.5        35698.5          1129500.0   
4           67500.0      135000.0         6750.0           135000.0   
5          135000.0      312682.5        29686.5           297000.0   
6          135000.0      312682.5        29686.5           297000.0   

  NAME_TYPE_SUITE_x NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0     Unaccompanied          Working  Secondary / secondary special   
1            Family    State servant               Higher education   
2            Family    State servant               Higher education   
3            Family    State servant               Higher education   
4     Unaccompanied          Working  Secondary / secondary special   
5     Unaccompanied          Working  Secondary / secondary special   
6     Unaccompanied          Working  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  REGION_POPULATION_RELATIVE  \
0  Single / not married  House / apartment                    0.018801   
1               Married  House / apartment                    0.003541   
2               Married  House / apartment                    0.003541   
3               Married  House / apartment                    0.003541   
4  Single / not married  House / apartment                    0.010032   
5        Civil marriage  House / apartment                    0.008019   
6        Civil marriage  House / apartment                    0.008019   

   DAYS_BIRTH  DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  OWN_CAR_AGE  \
0       -9461           -637            -3648.0            -2120          NaN   
1      -16765          -1188            -1186.0             -291          NaN   
2      -16765          -1188            -1186.0             -291          NaN   
3      -16765          -1188            -1186.0             -291          NaN   
4      -19046           -225            -4260.0            -2531         26.0   
5      -19005          -3039            -9833.0            -2437          NaN   
6      -19005          -3039            -9833.0            -2437          NaN   

  OCCUPATION_TYPE  CNT_FAM_MEMBERS  REGION_RATING_CLIENT  \
0        Laborers              1.0                     2   
1      Core staff              2.0                     1   
2      Core staff              2.0                     1   
3      Core staff              2.0                     1   
4        Laborers              1.0                     2   
5        Laborers              2.0                     2   
6        Laborers              2.0                     2   

   REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START_x  \
0                            2                    WEDNESDAY   
1                            1                       MONDAY   
2                            1                       MONDAY   
3                            1                       MONDAY   
4                            2                       MONDAY   
5                            2                    WEDNESDAY   
6                            2                    WEDNESDAY   

   HOUR_APPR_PROCESS_START_x  REG_REGION_NOT_LIVE_REGION  \
0                         10                           0   
1                         11                           0   
2                         11                           0   
3                         11                           0   
4                          9                           0   
5                         17                           0   
6                         17                           0   

   REG_REGION_NOT_WORK_REGION  LIVE_REGION_NOT_WORK_REGION  \
0                           0                            0   
1                           0                            0   
2                           0                            0   
3                           0                            0   
4                           0                            0   
5                           0                            0   
6                           0                            0   

   REG_CITY_NOT_LIVE_CITY  REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY  \
0                       0                       0                        0   
1                       0                       0                        0   
2                       0                       0                        0   
3                       0                       0                        0   
4                       0                       0                        0   
5                       0                       0                        0   
6                       0                       0                        0   

        ORGANIZATION_TYPE  EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  \
0  Business Entity Type 3      0.083037      0.262949      0.139376   
1                  School      0.311267      0.622246           NaN   
2                  School      0.311267      0.622246           NaN   
3                  School      0.311267      0.622246           NaN   
4              Government           NaN      0.555912      0.729567   
5  Business Entity Type 3           NaN      0.650442           NaN   
6  Business Entity Type 3           NaN      0.650442           NaN   

   APARTMENTS_AVG  BASEMENTAREA_AVG  YEARS_BEGINEXPLUATATION_AVG  \
0          0.0247            0.0369                       0.9722   
1          0.0959            0.0529                       0.9851   
2          0.0959            0.0529                       0.9851   
3          0.0959            0.0529                       0.9851   
4             NaN               NaN                          NaN   
5             NaN               NaN                          NaN   
6             NaN               NaN                          NaN   

   YEARS_BUILD_AVG  COMMONAREA_AVG  ELEVATORS_AVG  ENTRANCES_AVG  \
0           0.6192          0.0143           0.00         0.0690   
1           0.7960          0.0605           0.08         0.0345   
2           0.7960          0.0605           0.08         0.0345   
3           0.7960          0.0605           0.08         0.0345   
4              NaN             NaN            NaN            NaN   
5              NaN             NaN            NaN            NaN   
6              NaN             NaN            NaN            NaN   

   FLOORSMAX_AVG  FLOORSMIN_AVG  LANDAREA_AVG  LIVINGAPARTMENTS_AVG  \
0         0.0833         0.1250        0.0369                0.0202   
1         0.2917         0.3333        0.0130                0.0773   
2         0.2917         0.3333        0.0130                0.0773   
3         0.2917         0.3333        0.0130                0.0773   
4            NaN            NaN           NaN                   NaN   
5            NaN            NaN           NaN                   NaN   
6            NaN            NaN           NaN                   NaN   

   LIVINGAREA_AVG  NONLIVINGAPARTMENTS_AVG  NONLIVINGAREA_AVG  \
0          0.0190                   0.0000             0.0000   
1          0.0549                   0.0039             0.0098   
2          0.0549                   0.0039             0.0098   
3          0.0549                   0.0039             0.0098   
4             NaN                      NaN                NaN   
5             NaN                      NaN                NaN   
6             NaN                      NaN                NaN   

   APARTMENTS_MODE  BASEMENTAREA_MODE  YEARS_BEGINEXPLUATATION_MODE  \
0           0.0252             0.0383                        0.9722   
1           0.0924             0.0538                        0.9851   
2           0.0924             0.0538                        0.9851   
3           0.0924             0.0538                        0.9851   
4              NaN                NaN                           NaN   
5              NaN                NaN                           NaN   
6              NaN                NaN                           NaN   

   YEARS_BUILD_MODE  COMMONAREA_MODE  ELEVATORS_MODE  ENTRANCES_MODE  \
0            0.6341           0.0144          0.0000          0.0690   
1            0.8040           0.0497          0.0806          0.0345   
2            0.8040           0.0497          0.0806          0.0345   
3            0.8040           0.0497          0.0806          0.0345   
4               NaN              NaN             NaN             NaN   
5               NaN              NaN             NaN             NaN   
6               NaN              NaN             NaN             NaN   

   FLOORSMAX_MODE  FLOORSMIN_MODE  LANDAREA_MODE  LIVINGAPARTMENTS_MODE  \
0          0.0833          0.1250         0.0377                  0.022   
1          0.2917          0.3333         0.0128                  0.079   
2          0.2917          0.3333         0.0128                  0.079   
3          0.2917          0.3333         0.0128                  0.079   
4             NaN             NaN            NaN                    NaN   
5             NaN             NaN            NaN                    NaN   
6             NaN             NaN            NaN                    NaN   

   LIVINGAREA_MODE  NONLIVINGAPARTMENTS_MODE  NONLIVINGAREA_MODE  \
0           0.0198                       0.0                 0.0   
1           0.0554                       0.0                 0.0   
2           0.0554                       0.0                 0.0   
3           0.0554                       0.0                 0.0   
4              NaN                       NaN                 NaN   
5              NaN                       NaN                 NaN   
6              NaN                       NaN                 NaN   

   APARTMENTS_MEDI  BASEMENTAREA_MEDI  YEARS_BEGINEXPLUATATION_MEDI  \
0           0.0250             0.0369                        0.9722   
1           0.0968             0.0529                        0.9851   
2           0.0968             0.0529                        0.9851   
3           0.0968             0.0529                        0.9851   
4              NaN                NaN                           NaN   
5              NaN                NaN                           NaN   
6              NaN                NaN                           NaN   

   YEARS_BUILD_MEDI  COMMONAREA_MEDI  ELEVATORS_MEDI  ENTRANCES_MEDI  \
0            0.6243           0.0144            0.00          0.0690   
1            0.7987           0.0608            0.08          0.0345   
2            0.7987           0.0608            0.08          0.0345   
3            0.7987           0.0608            0.08          0.0345   
4               NaN              NaN             NaN             NaN   
5               NaN              NaN             NaN             NaN   
6               NaN              NaN             NaN             NaN   

   FLOORSMAX_MEDI  FLOORSMIN_MEDI  LANDAREA_MEDI  LIVINGAPARTMENTS_MEDI  \
0          0.0833          0.1250         0.0375                 0.0205   
1          0.2917          0.3333         0.0132                 0.0787   
2          0.2917          0.3333         0.0132                 0.0787   
3          0.2917          0.3333         0.0132                 0.0787   
4             NaN             NaN            NaN                    NaN   
5             NaN             NaN            NaN                    NaN   
6             NaN             NaN            NaN                    NaN   

   LIVINGAREA_MEDI  NONLIVINGAPARTMENTS_MEDI  NONLIVINGAREA_MEDI  \
0           0.0193                    0.0000                0.00   
1           0.0558                    0.0039                0.01   
2           0.0558                    0.0039                0.01   
3           0.0558                    0.0039                0.01   
4              NaN                       NaN                 NaN   
5              NaN                       NaN                 NaN   
6              NaN                       NaN                 NaN   

  FONDKAPREMONT_MODE  HOUSETYPE_MODE  TOTALAREA_MODE WALLSMATERIAL_MODE  \
0   reg oper account  block of flats          0.0149       Stone, brick   
1   reg oper account  block of flats          0.0714              Block   
2   reg oper account  block of flats          0.0714              Block   
3   reg oper account  block of flats          0.0714              Block   
4                NaN             NaN             NaN                NaN   
5                NaN             NaN             NaN                NaN   
6                NaN             NaN             NaN                NaN   

  EMERGENCYSTATE_MODE  OBS_30_CNT_SOCIAL_CIRCLE  DEF_30_CNT_SOCIAL_CIRCLE  \
0                  No                       2.0                       2.0   
1                  No                       1.0                       0.0   
2                  No                       1.0                       0.0   
3                  No                       1.0                       0.0   
4                 NaN                       0.0                       0.0   
5                 NaN                       2.0                       0.0   
6                 NaN                       2.0                       0.0   

   OBS_60_CNT_SOCIAL_CIRCLE  DEF_60_CNT_SOCIAL_CIRCLE  DAYS_LAST_PHONE_CHANGE  \
0                       2.0                       2.0                 -1134.0   
1                       1.0                       0.0                  -828.0   
2                       1.0                       0.0                  -828.0   
3                       1.0                       0.0                  -828.0   
4                       0.0                       0.0                  -815.0   
5                       2.0                       0.0                  -617.0   
6                       2.0                       0.0                  -617.0   

   AMT_REQ_CREDIT_BUREAU_HOUR  AMT_REQ_CREDIT_BUREAU_DAY  \
0                         0.0                        0.0   
1                         0.0                        0.0   
2                         0.0                        0.0   
3                         0.0                        0.0   
4                         0.0                        0.0   
5                         NaN                        NaN   
6                         NaN                        NaN   

   AMT_REQ_CREDIT_BUREAU_WEEK  AMT_REQ_CREDIT_BUREAU_MON  \
0                         0.0                        0.0   
1                         0.0                        0.0   
2                         0.0                        0.0   
3                         0.0                        0.0   
4                         0.0                        0.0   
5                         NaN                        NaN   
6                         NaN                        NaN   

   AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_YEAR  SK_ID_PREV  \
0                        0.0                         1.0   1038818.0   
1                        0.0                         0.0   1810518.0   
2                        0.0                         0.0   2636178.0   
3                        0.0                         0.0   2396755.0   
4                        0.0                         0.0   1564014.0   
5                        NaN                         NaN   2078043.0   
6                        NaN                         NaN   2827850.0   

  NAME_CONTRACT_TYPE_y  AMT_ANNUITY_y  AMT_APPLICATION  AMT_CREDIT_y  \
0       Consumer loans       9251.775         179055.0      179055.0   
1           Cash loans      98356.995         900000.0     1035882.0   
2       Consumer loans      64567.665         337500.0      348637.5   
3       Consumer loans       6737.310          68809.5       68053.5   
4       Consumer loans       5357.250          24282.0       20106.0   
5           Cash loans      24246.000         675000.0      675000.0   
6      Revolving loans            NaN              0.0           0.0   

   AMT_DOWN_PAYMENT  AMT_GOODS_PRICE_y WEEKDAY_APPR_PROCESS_START_y  \
0               0.0           179055.0                     SATURDAY   
1               NaN           900000.0                       FRIDAY   
2               0.0           337500.0                       SUNDAY   
3            6885.0            68809.5                     SATURDAY   
4            4860.0            24282.0                       FRIDAY   
5               NaN           675000.0                     THURSDAY   
6               NaN                NaN                     THURSDAY   

   HOUR_APPR_PROCESS_START_y  NFLAG_LAST_APPL_IN_DAY  RATE_DOWN_PAYMENT  \
0                        9.0                     1.0           0.000000   
1                       12.0                     1.0                NaN   
2                       17.0                     1.0           0.000000   
3                       15.0                     1.0           0.100061   
4                        5.0                     1.0           0.212008   
5                       15.0                     1.0                NaN   
6                       15.0                     1.0                NaN   

   RATE_INTEREST_PRIMARY  RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE  \
0                    NaN                       NaN                    XAP   
1                    NaN                       NaN                    XNA   
2                    NaN                       NaN                    XAP   
3                    NaN                       NaN                    XAP   
4                    NaN                       NaN                    XAP   
5                    NaN                       NaN                    XNA   
6                    NaN                       NaN                    XAP   

  NAME_CONTRACT_STATUS  DAYS_DECISION      NAME_PAYMENT_TYPE  \
0             Approved         -606.0                    XNA   
1             Approved         -746.0                    XNA   
2             Approved         -828.0  Cash through the bank   
3             Approved        -2341.0  Cash through the bank   
4             Approved         -815.0  Cash through the bank   
5             Approved         -181.0  Cash through the bank   
6             Canceled         -181.0                    XNA   

  CODE_REJECT_REASON NAME_TYPE_SUITE_y NAME_CLIENT_TYPE   NAME_GOODS_CATEGORY  \
0                XAP               NaN              New              Vehicles   
1                XAP     Unaccompanied         Repeater                   XNA   
2                XAP            Family        Refreshed             Furniture   
3                XAP            Family        Refreshed  Consumer Electronics   
4                XAP     Unaccompanied              New                Mobile   
5                XAP     Unaccompanied         Repeater                   XNA   
6                XAP               NaN         Repeater                   XNA   

  NAME_PORTFOLIO NAME_PRODUCT_TYPE             CHANNEL_TYPE  SELLERPLACE_AREA  \
0            POS               XNA                    Stone             500.0   
1           Cash            x-sell  Credit and cash offices              -1.0   
2            POS               XNA                    Stone            1400.0   
3            POS               XNA             Country-wide             200.0   
4            POS               XNA         Regional / Local              30.0   
5           Cash            x-sell  Credit and cash offices              -1.0   
6            XNA               XNA  Credit and cash offices              -1.0   

   NAME_SELLER_INDUSTRY  CNT_PAYMENT NAME_YIELD_GROUP  \
0       Auto technology         24.0       low_normal   
1                   XNA         12.0       low_normal   
2             Furniture          6.0           middle   
3  Consumer electronics         12.0           middle   
4          Connectivity          4.0           middle   
5                   XNA         48.0       low_normal   
6                   XNA          NaN              XNA   

           PRODUCT_COMBINATION  DAYS_FIRST_DRAWING  DAYS_FIRST_DUE  \
0      POS other with interest            365243.0          -565.0   
1             Cash X-Sell: low            365243.0          -716.0   
2   POS industry with interest            365243.0          -797.0   
3  POS household with interest            365243.0         -2310.0   
4  POS mobile without interest            365243.0          -784.0   
5             Cash X-Sell: low            365243.0          -151.0   
6                  Card Street                 NaN             NaN   

   DAYS_LAST_DUE_1ST_VERSION  DAYS_LAST_DUE  DAYS_TERMINATION  \
0                      125.0          -25.0             -17.0   
1                     -386.0         -536.0            -527.0   
2                     -647.0         -647.0            -639.0   
3                    -1980.0        -1980.0           -1976.0   
4                     -694.0         -724.0            -714.0   
5                     1259.0         -151.0            -143.0   
6                        NaN            NaN               NaN   

   NFLAG_INSURED_ON_APPROVAL  
0                        0.0  
1                        1.0  
2                        0.0  
3                        1.0  
4                        0.0  
5                        0.0  
6                        NaN  
In [46]:
## Key Insights and Conclusions

# Generate summary insights
def generate_insights(df, target_col='TARGET'):
    insights = []
    
    # Target distribution
    target_dist = df[target_col].value_counts(normalize=True) * 100
    insights.append(f"Target Distribution: {target_dist[0]:.1f}% Non-defaulters, {target_dist[1]:.1f}% Defaulters")
    
    # Gender analysis
    if 'CODE_GENDER' in df.columns:
        gender_default = pd.crosstab(df['CODE_GENDER'], df[target_col], normalize='index') * 100
        insights.append(f"Gender Analysis: Female default rate: {gender_default.loc['F', 1]:.1f}%, Male default rate: {gender_default.loc['M', 1]:.1f}%")
    
    # Age group analysis
    if 'AGE_GROUP' in df.columns:
        age_default = pd.crosstab(df['AGE_GROUP'], df[target_col], normalize='index') * 100
        insights.append("Age Group Default Rates:")
        for age_group in age_default.index:
            insights.append(f"  {age_group}: {age_default.loc[age_group, 1]:.1f}%")
    
    return insights

insights = generate_insights(app_data)
for insight in insights:
    print(insight)
Target Distribution: 91.9% Non-defaulters, 8.1% Defaulters
Gender Analysis: Female default rate: 7.0%, Male default rate: 10.1%
In [ ]: